2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14 good enough to include the read in the stats.
18 #define BAMCHECK_VERSION "2012-09-04"
20 #define _ISOC99_SOURCE
33 #include "sam_header.h"
36 #define BWA_MIN_RDLEN 35
37 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
38 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
39 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
40 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
41 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
42 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
43 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
47 int32_t line_len, line_blen;
52 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
53 KHASH_MAP_INIT_STR(kh_bam_tid, int)
54 KHASH_MAP_INIT_STR(kh_rg, const char *)
59 khash_t(kh_faidx) *hash;
69 // For coverage distribution, a simple pileup
78 typedef struct { uint32_t from, to; } pos_t;
89 int trim_qual; // bwa trim quality
91 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
92 // insert size histogram holder
93 int nquals; // The number of quality bins
94 int nbases; // The maximum sequence length the allocated array can hold
95 int nisize; // The maximum insert size that the allocated array can hold
96 int ngc; // The size of gc_1st and gc_2nd
97 int nindels; // The maximum indel length for indel distribution
99 // Arrays for the histogram data
100 uint64_t *quals_1st, *quals_2nd;
101 uint64_t *gc_1st, *gc_2nd;
102 uint64_t *isize_inward, *isize_outward, *isize_other;
103 uint64_t *acgt_cycles;
104 uint64_t *read_lengths;
105 uint64_t *insertions, *deletions;
106 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
108 // The extremes encountered
109 int max_len; // Maximum read length
110 int max_qual; // Maximum quality
111 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
116 uint64_t total_len_dup;
120 uint64_t nreads_unmapped;
121 uint64_t nreads_unpaired;
122 uint64_t nreads_paired;
124 uint64_t nbases_mapped;
125 uint64_t nbases_mapped_cigar;
126 uint64_t nbases_trimmed; // bwa trimmed bases
127 uint64_t nmismatches;
129 // GC-depth related data
130 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
131 gc_depth_t *gcd; // The GC-depth bins holder
132 int gcd_bin_size; // The size of GC-depth bin
133 uint32_t gcd_ref_size; // The approximate size of the genome
134 int32_t tid, gcd_pos; // Position of the current bin
135 int32_t pos; // Position of the last read
137 // Coverage distribution related data
138 int ncov; // The number of coverage bins
139 uint64_t *cov; // The coverage frequencies
140 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
141 round_buffer_t cov_rbuf; // Pileup round buffer
143 // Mismatches by read cycle
144 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
145 int mrseq_buf; // The size of the buffer
146 int32_t rseq_pos; // The coordinate of the first base in the buffer
147 int32_t nrseq_buf; // The used part of the buffer
148 uint64_t *mpc_buf; // Mismatches per cycle
154 int nregions, reg_from,reg_to;
158 int flag_require, flag_filter;
159 double sum_qual; // For calculating average quality value
161 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
162 faidx_t *fai; // Reference sequence for GC-depth graph
163 int argc; // Command line arguments to be printed on the output
168 void error(const char *format, ...);
169 void bam_init_header_hash(bam_header_t *header);
170 int is_in_regions(bam1_t *bam_line, stats_t *stats);
173 // Coverage distribution methods
174 inline int coverage_idx(int min, int max, int n, int step, int depth)
182 return 1 + (depth - min) / step;
185 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
187 return (offset + (pos-refpos) % size) % size;
190 void round_buffer_flush(stats_t *stats, int64_t pos)
194 if ( pos==stats->cov_rbuf.pos )
197 int64_t new_pos = pos;
198 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
200 // Flush the whole buffer, but in sequential order,
201 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
204 if ( pos < stats->cov_rbuf.pos )
205 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
207 int ifrom = stats->cov_rbuf.start;
208 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
211 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
213 if ( !stats->cov_rbuf.buffer[ibuf] )
215 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
217 stats->cov_rbuf.buffer[ibuf] = 0;
221 for (ibuf=ifrom; ibuf<=ito; ibuf++)
223 if ( !stats->cov_rbuf.buffer[ibuf] )
225 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
227 stats->cov_rbuf.buffer[ibuf] = 0;
229 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
230 stats->cov_rbuf.pos = new_pos;
233 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
235 if ( to-from >= rbuf->size )
236 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
237 if ( from < rbuf->pos )
238 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
241 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
242 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
245 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
246 rbuf->buffer[ibuf]++;
249 for (ibuf=ifrom; ibuf<=ito; ibuf++)
250 rbuf->buffer[ibuf]++;
253 // Calculate the number of bases in the read trimmed by BWA
254 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
256 if ( len<BWA_MIN_RDLEN ) return 0;
258 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
259 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
260 int max_trimmed = len - BWA_MIN_RDLEN + 1;
261 int l, sum=0, max_sum=0, max_l=0;
263 for (l=0; l<max_trimmed; l++)
265 sum += trim_qual - quals[ reverse ? l : len-1-l ];
270 // This is the correct way, but bwa clips from some reason one base less
279 void count_indels(stats_t *stats,bam1_t *bam_line)
281 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
282 int is_1st = IS_READ1(bam_line) ? 1 : 0;
285 int read_len = bam_line->core.l_qseq;
286 for (icig=0; icig<bam_line->core.n_cigar; icig++)
288 // Conversion from uint32_t to MIDNSHP
291 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
292 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
296 int idx = is_fwd ? icycle : read_len-icycle-ncig;
298 error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
299 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
301 stats->ins_cycles_1st[idx]++;
303 stats->ins_cycles_2nd[idx]++;
305 if ( ncig<=stats->nindels )
306 stats->insertions[ncig-1]++;
311 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
312 if ( idx<0 ) continue; // discard meaningless deletions
313 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
315 stats->del_cycles_1st[idx]++;
317 stats->del_cycles_2nd[idx]++;
318 if ( ncig<=stats->nindels )
319 stats->deletions[ncig-1]++;
322 if ( cig!=3 && cig!=5 )
327 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
329 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
330 int icig,iread=0,icycle=0;
331 int iref = bam_line->core.pos - stats->rseq_pos;
332 int read_len = bam_line->core.l_qseq;
333 uint8_t *read = bam1_seq(bam_line);
334 uint8_t *quals = bam1_qual(bam_line);
335 uint64_t *mpc_buf = stats->mpc_buf;
336 for (icig=0; icig<bam_line->core.n_cigar; icig++)
338 // Conversion from uint32_t to MIDNSHP
341 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
342 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
357 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
367 // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
368 // chunk of refseq in memory. Not very frequent and not noticable in the stats.
369 if ( cig==3 || cig==5 ) continue;
371 error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
373 if ( ncig+iref > stats->nrseq_buf )
374 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
377 for (im=0; im<ncig; im++)
379 uint8_t cread = bam1_seqi(read,iread);
380 uint8_t cref = stats->rseq_buf[iref];
386 int idx = is_fwd ? icycle : read_len-icycle-1;
387 if ( idx>stats->max_len )
388 error("mpc: %d>%d\n",idx,stats->max_len);
389 idx = idx*stats->nquals;
390 if ( idx>=stats->nquals*stats->nbases )
391 error("FIXME: mpc_buf overflow\n");
394 else if ( cref && cread && cref!=cread )
396 uint8_t qual = quals[iread] + 1;
397 if ( qual>=stats->nquals )
398 error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
400 int idx = is_fwd ? icycle : read_len-icycle-1;
401 if ( idx>stats->max_len )
402 error("mpc: %d>%d\n",idx,stats->max_len);
404 idx = idx*stats->nquals + qual;
405 if ( idx>=stats->nquals*stats->nbases )
406 error("FIXME: mpc_buf overflow\n");
417 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
419 khash_t(kh_faidx) *h;
423 faidx_t *fai = stats->fai;
426 chr = stats->sam->header->target_name[tid];
428 // ID of the sequence name
429 iter = kh_get(kh_faidx, h, chr);
430 if (iter == kh_end(h))
431 error("No such reference sequence [%s]?\n", chr);
432 val = kh_value(h, iter);
434 // Check the boundaries
436 error("Was the bam file mapped with the reference sequence supplied?"
437 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
438 int size = stats->mrseq_buf;
439 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
440 if (size+pos > val.len) size = val.len-pos;
442 // Position the razf reader
443 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
445 uint8_t *ptr = stats->rseq_buf;
447 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
452 // Conversion between uint8_t coding and ACGT
455 if ( c=='A' || c=='a' )
457 else if ( c=='C' || c=='c' )
459 else if ( c=='G' || c=='g' )
461 else if ( c=='T' || c=='t' )
468 if ( nread < stats->mrseq_buf )
470 memset(ptr,0, stats->mrseq_buf - nread);
471 nread = stats->mrseq_buf;
473 stats->nrseq_buf = nread;
474 stats->rseq_pos = pos;
478 float fai_gc_content(stats_t *stats, int pos, int len)
481 int i = pos - stats->rseq_pos, ito = i + len;
482 assert( i>=0 && ito<=stats->nrseq_buf );
488 c = stats->rseq_buf[i];
494 else if ( c==1 || c==8 )
497 return count ? (float)gc/count : 0;
500 void realloc_rseq_buffer(stats_t *stats)
502 int n = stats->nbases*10;
503 if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
504 if ( stats->mrseq_buf<n )
506 stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
507 stats->mrseq_buf = n;
511 void realloc_gcd_buffer(stats_t *stats, int seq_len)
513 if ( seq_len >= stats->gcd_bin_size )
514 error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
516 int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
517 if ( n <= stats->igcd )
518 error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
520 if ( n > stats->ngcd )
522 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
524 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
525 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
529 realloc_rseq_buffer(stats);
532 void realloc_buffers(stats_t *stats, int seq_len)
534 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
536 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
537 if ( !stats->quals_1st )
538 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
539 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
541 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
542 if ( !stats->quals_2nd )
543 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
544 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
546 if ( stats->mpc_buf )
548 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
549 if ( !stats->mpc_buf )
550 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
551 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
554 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
555 if ( !stats->acgt_cycles )
556 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
557 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
559 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
560 if ( !stats->read_lengths )
561 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
562 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
564 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
565 if ( !stats->insertions )
566 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
567 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
569 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
570 if ( !stats->deletions )
571 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
572 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
574 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
575 if ( !stats->ins_cycles_1st )
576 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
577 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
579 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
580 if ( !stats->ins_cycles_2nd )
581 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
582 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
584 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
585 if ( !stats->del_cycles_1st )
586 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
587 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
589 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
590 if ( !stats->del_cycles_2nd )
591 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
592 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
596 // Realloc the coverage distribution buffer
597 int *rbuffer = calloc(sizeof(int),seq_len*5);
598 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
599 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
600 if ( stats->cov_rbuf.start>1 )
601 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
602 stats->cov_rbuf.start = 0;
603 free(stats->cov_rbuf.buffer);
604 stats->cov_rbuf.buffer = rbuffer;
605 stats->cov_rbuf.size = seq_len*5;
607 realloc_rseq_buffer(stats);
610 void collect_stats(bam1_t *bam_line, stats_t *stats)
612 if ( stats->rg_hash )
614 const uint8_t *rg = bam_aux_get(bam_line, "RG");
616 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
617 if ( k == kh_end(stats->rg_hash) ) return;
619 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
621 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
624 if ( !is_in_regions(bam_line,stats) )
627 int seq_len = bam_line->core.l_qseq;
628 if ( !seq_len ) return;
629 if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
630 if ( seq_len >= stats->nbases )
631 realloc_buffers(stats,seq_len);
632 if ( stats->max_len<seq_len )
633 stats->max_len = seq_len;
635 stats->read_lengths[seq_len]++;
637 // Count GC and ACGT per cycle
638 uint8_t base, *seq = bam1_seq(bam_line);
641 int reverse = IS_REVERSE(bam_line);
642 for (i=0; i<seq_len; i++)
644 // Conversion from uint8_t coding to ACGT
648 base = bam1_seqi(seq,i);
650 if ( base==1 || base==2 ) gc_count++;
651 else if ( base>2 ) base=3;
652 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
653 error("FIXME: acgt_cycles\n");
654 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
656 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
657 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
658 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
660 // Determine which array (1st or 2nd read) will these stats go to,
661 // trim low quality bases from end the same way BWA does,
664 uint8_t *bam_quals = bam1_qual(bam_line);
665 if ( bam_line->core.flag&BAM_FREAD2 )
667 quals = stats->quals_2nd;
669 for (i=gc_idx_min; i<gc_idx_max; i++)
674 quals = stats->quals_1st;
676 for (i=gc_idx_min; i<gc_idx_max; i++)
679 if ( stats->trim_qual>0 )
680 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
682 // Quality histogram and average quality
683 for (i=0; i<seq_len; i++)
685 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
686 if ( qual>=stats->nquals )
687 error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
688 if ( qual>stats->max_qual )
689 stats->max_qual = qual;
691 quals[ i*stats->nquals+qual ]++;
692 stats->sum_qual += qual;
695 // Look at the flags and increment appropriate counters (mapped, paired, etc)
696 if ( IS_UNMAPPED(bam_line) )
697 stats->nreads_unmapped++;
700 if ( !bam_line->core.qual )
703 count_indels(stats,bam_line);
705 // The insert size is tricky, because for long inserts the libraries are
706 // prepared differently and the pairs point in other direction. BWA does
707 // not set the paired flag for them. Similar thing is true also for 454
708 // reads. Therefore, do the insert size stats for all mapped reads.
709 int32_t isize = bam_line->core.isize;
710 if ( isize<0 ) isize = -isize;
711 if ( IS_PAIRED(bam_line) && isize!=0 )
713 stats->nreads_paired++;
714 if ( isize >= stats->nisize )
715 isize=stats->nisize-1;
717 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
718 int is_fst = IS_READ1(bam_line) ? 1 : -1;
719 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
720 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
722 if ( is_fwd*is_mfwd>0 )
723 stats->isize_other[isize]++;
724 else if ( is_fst*pos_fst>0 )
726 if ( is_fst*is_fwd>0 )
727 stats->isize_inward[isize]++;
729 stats->isize_outward[isize]++;
731 else if ( is_fst*pos_fst<0 )
733 if ( is_fst*is_fwd>0 )
734 stats->isize_outward[isize]++;
736 stats->isize_inward[isize]++;
740 stats->nreads_unpaired++;
742 // Number of mismatches
743 uint8_t *nm = bam_aux_get(bam_line,"NM");
745 stats->nmismatches += bam_aux2i(nm);
747 // Number of mapped bases from cigar
748 // Conversion from uint32_t to MIDNSHP
751 if ( bam_line->core.n_cigar == 0)
752 error("FIXME: mapped read with no cigar?\n");
754 if ( stats->regions )
756 // Count only on-target bases
757 int iref = bam_line->core.pos + 1;
758 for (i=0; i<bam_line->core.n_cigar; i++)
760 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
761 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
762 if ( cig==2 ) readlen += ncig;
765 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
766 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
767 if ( ncig<0 ) ncig = 0;
768 stats->nbases_mapped_cigar += ncig;
769 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
774 if ( iref>=stats->reg_from && iref<=stats->reg_to )
775 stats->nbases_mapped_cigar += ncig;
781 // Count the whole read
782 for (i=0; i<bam_line->core.n_cigar; i++)
784 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
785 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
786 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
787 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
790 stats->nbases_mapped += seq_len;
792 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
793 stats->is_sorted = 0;
794 stats->pos = bam_line->core.pos;
796 if ( stats->is_sorted )
798 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
799 round_buffer_flush(stats,-1);
801 // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
802 // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
803 // 20kbp, so the effect is negligible.
806 int inc_ref = 0, inc_gcd = 0;
807 // First pass or new chromosome
808 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
809 // Read goes beyond the end of the rseq buffer
810 else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
811 // Read overlaps the next gcd bin
812 else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
815 if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
820 if ( stats->igcd >= stats->ngcd )
821 realloc_gcd_buffer(stats, readlen);
823 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
824 stats->gcd_pos = bam_line->core.pos;
825 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
828 count_mismatches_per_cycle(stats,bam_line);
830 // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
831 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
833 // First pass or a new chromosome
834 stats->tid = bam_line->core.tid;
835 stats->gcd_pos = bam_line->core.pos;
837 if ( stats->igcd >= stats->ngcd )
838 realloc_gcd_buffer(stats, readlen);
840 stats->gcd[ stats->igcd ].depth++;
841 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
843 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
845 // Coverage distribution graph
846 round_buffer_flush(stats,bam_line->core.pos);
847 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
851 stats->total_len += seq_len;
852 if ( IS_DUP(bam_line) )
854 stats->total_len_dup += seq_len;
859 // Sort by GC and depth
860 #define GCD_t(x) ((gc_depth_t *)x)
861 static int gcd_cmp(const void *a, const void *b)
863 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
864 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
865 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
866 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
871 float gcd_percentile(gc_depth_t *gcd, int N, int p)
881 return gcd[N-1].depth;
884 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
887 void output_stats(stats_t *stats)
889 // Calculate average insert size and standard deviation (from the main bulk data only)
891 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
892 for (isize=1; isize<stats->nisize; isize++)
894 // Each pair was counted twice
895 stats->isize_inward[isize] *= 0.5;
896 stats->isize_outward[isize] *= 0.5;
897 stats->isize_other[isize] *= 0.5;
899 nisize_inward += stats->isize_inward[isize];
900 nisize_outward += stats->isize_outward[isize];
901 nisize_other += stats->isize_other[isize];
902 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
905 double bulk=0, avg_isize=0, sd_isize=0;
906 for (isize=1; isize<stats->nisize; isize++)
908 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
909 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
911 if ( bulk/nisize > stats->isize_main_bulk )
918 avg_isize /= nisize ? nisize : 1;
919 for (isize=1; isize<ibulk; isize++)
920 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
921 sd_isize = sqrt(sd_isize);
924 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
925 printf("# The command line was: %s",stats->argv[0]);
927 for (i=1; i<stats->argc; i++)
928 printf(" %s",stats->argv[i]);
930 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
931 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
932 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
933 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
934 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
935 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
936 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
937 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
938 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
939 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
940 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
941 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
942 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
943 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
944 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
945 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
946 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
947 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
948 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
949 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
950 printf("SN\taverage length:\t%.0f\n", avg_read_length);
951 printf("SN\tmaximum length:\t%d\n", stats->max_len);
952 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
953 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
954 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
955 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
956 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
957 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
960 if ( stats->max_len<stats->nbases ) stats->max_len++;
961 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
962 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
963 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
964 for (ibase=0; ibase<stats->max_len; ibase++)
966 printf("FFQ\t%d",ibase+1);
967 for (iqual=0; iqual<=stats->max_qual; iqual++)
969 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
973 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
974 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
975 for (ibase=0; ibase<stats->max_len; ibase++)
977 printf("LFQ\t%d",ibase+1);
978 for (iqual=0; iqual<=stats->max_qual; iqual++)
980 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
984 if ( stats->mpc_buf )
986 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
987 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
988 printf("# is the number of N's and the rest is the number of mismatches\n");
989 for (ibase=0; ibase<stats->max_len; ibase++)
991 printf("MPC\t%d",ibase+1);
992 for (iqual=0; iqual<=stats->max_qual; iqual++)
994 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
999 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
1001 for (ibase=0; ibase<stats->ngc; ibase++)
1003 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
1004 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
1007 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
1009 for (ibase=0; ibase<stats->ngc; ibase++)
1011 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
1012 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
1015 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
1016 for (ibase=0; ibase<stats->max_len; ibase++)
1018 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
1019 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
1020 if ( ! sum ) continue;
1021 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
1023 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
1024 for (isize=1; isize<ibulk; isize++)
1025 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
1026 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
1028 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
1030 for (ilen=0; ilen<stats->max_len; ilen++)
1032 if ( stats->read_lengths[ilen]>0 )
1033 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
1036 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
1037 for (ilen=0; ilen<stats->nindels; ilen++)
1039 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
1040 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
1043 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
1044 for (ilen=0; ilen<=stats->nbases; ilen++)
1046 // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
1047 // the index of the cycle of the first inserted base (also 1-based)
1048 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
1049 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
1052 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
1053 if ( stats->cov[0] )
1054 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
1056 for (icov=1; icov<stats->ncov-1; icov++)
1057 if ( stats->cov[icov] )
1058 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
1059 if ( stats->cov[stats->ncov-1] )
1060 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
1062 // Calculate average GC content, then sort by GC and depth
1063 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
1065 for (igcd=0; igcd<stats->igcd; igcd++)
1068 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
1070 if ( stats->gcd[igcd].depth )
1071 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
1073 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1075 while ( igcd < stats->igcd )
1077 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1078 uint32_t nbins=0, itmp=igcd;
1079 float gc = stats->gcd[igcd].gc;
1080 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1085 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1086 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1087 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
1088 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
1089 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
1090 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
1096 size_t mygetline(char **line, size_t *n, FILE *fp)
1098 if (line == NULL || n == NULL || fp == NULL)
1103 if (*n==0 || !*line)
1111 while ((c=getc(fp))!= EOF && c!='\n')
1116 *line = realloc(*line, sizeof(char)*(*n));
1118 (*line)[nread-1] = c;
1123 *line = realloc(*line, sizeof(char)*(*n));
1126 return nread>0 ? nread : -1;
1130 void init_regions(stats_t *stats, char *file)
1133 khash_t(kh_bam_tid) *header_hash;
1135 bam_init_header_hash(stats->sam->header);
1136 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
1138 FILE *fp = fopen(file,"r");
1139 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1145 int prev_tid=-1, prev_pos=-1;
1146 while ((nread = mygetline(&line, &len, fp)) != -1)
1148 if ( line[0] == '#' ) continue;
1151 while ( i<nread && !isspace(line[i]) ) i++;
1152 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1155 iter = kh_get(kh_bam_tid, header_hash, line);
1156 int tid = kh_val(header_hash, iter);
1157 if ( iter == kh_end(header_hash) )
1160 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
1165 if ( tid >= stats->nregions )
1167 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1169 for (j=stats->nregions; j<stats->nregions+100; j++)
1171 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1172 stats->regions[j].pos = NULL;
1174 stats->nregions += 100;
1176 int npos = stats->regions[tid].npos;
1177 if ( npos >= stats->regions[tid].mpos )
1179 stats->regions[tid].mpos += 1000;
1180 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1183 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1184 if ( prev_tid==-1 || prev_tid!=tid )
1187 prev_pos = stats->regions[tid].pos[npos].from;
1189 if ( prev_pos>stats->regions[tid].pos[npos].from )
1190 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1191 stats->regions[tid].npos++;
1193 if (line) free(line);
1194 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
1198 void destroy_regions(stats_t *stats)
1201 for (i=0; i<stats->nregions; i++)
1203 if ( !stats->regions[i].mpos ) continue;
1204 free(stats->regions[i].pos);
1206 if ( stats->regions ) free(stats->regions);
1209 static int fetch_read(const bam1_t *bam_line, void *data)
1211 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1215 void reset_regions(stats_t *stats)
1218 for (i=0; i<stats->nregions; i++)
1219 stats->regions[i].cpos = 0;
1222 int is_in_regions(bam1_t *bam_line, stats_t *stats)
1224 if ( !stats->regions ) return 1;
1226 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1227 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1229 regions_t *reg = &stats->regions[bam_line->core.tid];
1230 if ( reg->cpos==reg->npos ) return 0; // done for this chr
1232 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1233 // even small overlap is enough to include the read in the stats.
1235 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1236 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1237 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1239 stats->reg_from = reg->pos[i].from;
1240 stats->reg_to = reg->pos[i].to;
1245 void init_group_id(stats_t *stats, char *id)
1247 if ( !stats->sam->header->dict )
1248 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
1249 void *iter = stats->sam->header->dict;
1250 const char *key, *val;
1252 stats->rg_hash = kh_init(kh_rg);
1253 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
1255 if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
1257 khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
1258 if ( k != kh_end(stats->rg_hash) )
1259 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
1261 k = kh_put(kh_rg, stats->rg_hash, key, &ret);
1262 kh_value(stats->rg_hash, k) = val;
1267 error("The sample or read group \"%s\" not present.\n", id);
1271 void error(const char *format, ...)
1275 printf("Version: %s\n", BAMCHECK_VERSION);
1276 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1277 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1278 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1279 printf("Options:\n");
1280 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1281 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1282 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
1283 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
1284 printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
1285 printf(" -h, --help This help message\n");
1286 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1287 printf(" -I, --id <string> Include only listed read group or sample name\n");
1288 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1289 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1290 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1291 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1292 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1293 printf(" -s, --sam Input is SAM\n");
1299 va_start(ap, format);
1300 vfprintf(stderr, format, ap);
1306 int main(int argc, char *argv[])
1308 char *targets = NULL;
1309 char *bam_fname = NULL;
1310 char *group_id = NULL;
1311 samfile_t *sam = NULL;
1314 stats_t *stats = calloc(1,sizeof(stats_t));
1317 stats->nbases = 300;
1318 stats->nisize = 8000;
1319 stats->max_len = 30;
1320 stats->max_qual = 40;
1321 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1322 stats->gcd_bin_size = 20e3;
1323 stats->gcd_ref_size = 4.2e9;
1324 stats->rseq_pos = -1;
1325 stats->tid = stats->gcd_pos = stats->igcd = -1;
1326 stats->is_sorted = 1;
1328 stats->cov_max = 1000;
1329 stats->cov_step = 1;
1332 stats->filter_readlen = -1;
1333 stats->nindels = stats->nbases;
1335 strcpy(in_mode, "rb");
1337 static struct option loptions[] =
1340 {"remove-dups",0,0,'d'},
1342 {"ref-seq",1,0,'r'},
1343 {"coverage",1,0,'c'},
1344 {"read-length",1,0,'l'},
1345 {"insert-size",1,0,'i'},
1346 {"most-inserts",1,0,'m'},
1347 {"trim-quality",1,0,'q'},
1348 {"target-regions",0,0,'t'},
1349 {"required-flag",1,0,'f'},
1350 {"filtering-flag",0,0,'F'},
1356 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
1360 case 'f': stats->flag_require=strtol(optarg,0,0); break;
1361 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
1362 case 'd': stats->flag_filter|=BAM_FDUP; break;
1363 case 's': strcpy(in_mode, "r"); break;
1364 case 'r': stats->fai = fai_load(optarg);
1366 error("Could not load faidx: %s\n", optarg);
1370 if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
1371 error("Unable to parse --GC-depth %s\n", optarg);
1372 stats->gcd_bin_size = fbin;
1373 stats->gcd_ref_size = flen;
1376 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1377 error("Unable to parse -c %s\n", optarg);
1379 case 'l': stats->filter_readlen = atoi(optarg); break;
1380 case 'i': stats->nisize = atoi(optarg); break;
1381 case 'm': stats->isize_main_bulk = atof(optarg); break;
1382 case 'q': stats->trim_qual = atoi(optarg); break;
1383 case 't': targets = optarg; break;
1384 case 'I': group_id = optarg; break;
1386 case 'h': error(NULL);
1387 default: error("Unknown argument: %s\n", optarg);
1391 bam_fname = argv[optind++];
1395 if ( isatty(fileno((FILE *)stdin)) )
1401 // .. coverage bins and round buffer
1402 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1404 stats->cov_step = stats->cov_max - stats->cov_min;
1405 if ( stats->cov_step <= 0 )
1406 stats->cov_step = 1;
1408 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1409 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1410 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1411 stats->cov_rbuf.size = stats->nbases*5;
1412 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1414 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1415 error("Failed to open: %s\n", bam_fname);
1417 if ( group_id ) init_group_id(stats, group_id);
1418 bam1_t *bam_line = bam_init1();
1420 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1421 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1422 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1423 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1424 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1425 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1426 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1427 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1428 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1429 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1430 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1431 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1432 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1433 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1434 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1435 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1436 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1437 realloc_rseq_buffer(stats);
1439 init_regions(stats, targets);
1441 // Collect statistics
1444 // Collect stats in selected regions only
1445 bam_index_t *bam_idx = bam_index_load(bam_fname);
1447 error("Random alignment retrieval only works for indexed BAM files.\n");
1450 for (i=optind; i<argc; i++)
1453 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1454 if ( tid < 0 ) continue;
1455 reset_regions(stats);
1456 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1458 bam_index_destroy(bam_idx);
1462 // Stream through the entire BAM ignoring off-target regions if -t is given
1463 while (samread(sam,bam_line) >= 0)
1464 collect_stats(bam_line,stats);
1466 round_buffer_flush(stats,-1);
1468 output_stats(stats);
1470 bam_destroy1(bam_line);
1471 samclose(stats->sam);
1472 if (stats->fai) fai_destroy(stats->fai);
1473 free(stats->cov_rbuf.buffer); free(stats->cov);
1474 free(stats->quals_1st); free(stats->quals_2nd);
1475 free(stats->gc_1st); free(stats->gc_2nd);
1476 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1478 free(stats->rseq_buf);
1479 free(stats->mpc_buf);
1480 free(stats->acgt_cycles);
1481 free(stats->read_lengths);
1482 free(stats->insertions);
1483 free(stats->deletions);
1484 free(stats->ins_cycles_1st);
1485 free(stats->ins_cycles_2nd);
1486 free(stats->del_cycles_1st);
1487 free(stats->del_cycles_2nd);
1488 destroy_regions(stats);
1490 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);