2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14 good enough to include the read in the stats.
18 #define BAMCHECK_VERSION "2012-09-04"
20 #define _ISOC99_SOURCE
33 #include "sam_header.h"
36 #define BWA_MIN_RDLEN 35
37 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
38 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
39 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
40 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
41 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
42 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
43 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
47 int32_t line_len, line_blen;
52 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
53 KHASH_MAP_INIT_STR(kh_bam_tid, int)
54 KHASH_MAP_INIT_STR(kh_rg, const char *)
59 khash_t(kh_faidx) *hash;
69 // For coverage distribution, a simple pileup
78 typedef struct { uint32_t from, to; } pos_t;
89 int trim_qual; // bwa trim quality
91 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
92 // insert size histogram holder
93 int nquals; // The number of quality bins
94 int nbases; // The maximum sequence length the allocated array can hold
95 int nisize; // The maximum insert size that the allocated array can hold
96 int ngc; // The size of gc_1st and gc_2nd
97 int nindels; // The maximum indel length for indel distribution
99 // Arrays for the histogram data
100 uint64_t *quals_1st, *quals_2nd;
101 uint64_t *gc_1st, *gc_2nd;
102 uint64_t *isize_inward, *isize_outward, *isize_other;
103 uint64_t *acgt_cycles;
104 uint64_t *read_lengths;
105 uint64_t *insertions, *deletions;
106 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
108 // The extremes encountered
109 int max_len; // Maximum read length
110 int max_qual; // Maximum quality
111 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
116 uint64_t total_len_dup;
120 uint64_t nreads_unmapped;
121 uint64_t nreads_unpaired;
122 uint64_t nreads_paired;
123 uint64_t nreads_anomalous;
125 uint64_t nbases_mapped;
126 uint64_t nbases_mapped_cigar;
127 uint64_t nbases_trimmed; // bwa trimmed bases
128 uint64_t nmismatches;
129 uint64_t nreads_QCfailed, nreads_secondary;
131 // GC-depth related data
132 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
133 gc_depth_t *gcd; // The GC-depth bins holder
134 int gcd_bin_size; // The size of GC-depth bin
135 uint32_t gcd_ref_size; // The approximate size of the genome
136 int32_t tid, gcd_pos; // Position of the current bin
137 int32_t pos; // Position of the last read
139 // Coverage distribution related data
140 int ncov; // The number of coverage bins
141 uint64_t *cov; // The coverage frequencies
142 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
143 round_buffer_t cov_rbuf; // Pileup round buffer
145 // Mismatches by read cycle
146 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
147 int mrseq_buf; // The size of the buffer
148 int32_t rseq_pos; // The coordinate of the first base in the buffer
149 int32_t nrseq_buf; // The used part of the buffer
150 uint64_t *mpc_buf; // Mismatches per cycle
156 int nregions, reg_from,reg_to;
160 int flag_require, flag_filter;
161 double sum_qual; // For calculating average quality value
163 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
164 faidx_t *fai; // Reference sequence for GC-depth graph
165 int argc; // Command line arguments to be printed on the output
170 void error(const char *format, ...);
171 void bam_init_header_hash(bam_header_t *header);
172 int is_in_regions(bam1_t *bam_line, stats_t *stats);
175 // Coverage distribution methods
176 inline int coverage_idx(int min, int max, int n, int step, int depth)
184 return 1 + (depth - min) / step;
187 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
189 return (offset + (pos-refpos) % size) % size;
192 void round_buffer_flush(stats_t *stats, int64_t pos)
196 if ( pos==stats->cov_rbuf.pos )
199 int64_t new_pos = pos;
200 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
202 // Flush the whole buffer, but in sequential order,
203 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
206 if ( pos < stats->cov_rbuf.pos )
207 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
209 int ifrom = stats->cov_rbuf.start;
210 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
213 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
215 if ( !stats->cov_rbuf.buffer[ibuf] )
217 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
219 stats->cov_rbuf.buffer[ibuf] = 0;
223 for (ibuf=ifrom; ibuf<=ito; ibuf++)
225 if ( !stats->cov_rbuf.buffer[ibuf] )
227 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
229 stats->cov_rbuf.buffer[ibuf] = 0;
231 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
232 stats->cov_rbuf.pos = new_pos;
235 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
237 if ( to-from >= rbuf->size )
238 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
239 if ( from < rbuf->pos )
240 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
243 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
244 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
247 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
248 rbuf->buffer[ibuf]++;
251 for (ibuf=ifrom; ibuf<=ito; ibuf++)
252 rbuf->buffer[ibuf]++;
255 // Calculate the number of bases in the read trimmed by BWA
256 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
258 if ( len<BWA_MIN_RDLEN ) return 0;
260 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
261 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
262 int max_trimmed = len - BWA_MIN_RDLEN + 1;
263 int l, sum=0, max_sum=0, max_l=0;
265 for (l=0; l<max_trimmed; l++)
267 sum += trim_qual - quals[ reverse ? l : len-1-l ];
272 // This is the correct way, but bwa clips from some reason one base less
281 void count_indels(stats_t *stats,bam1_t *bam_line)
283 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
284 int is_1st = IS_READ1(bam_line) ? 1 : 0;
287 int read_len = bam_line->core.l_qseq;
288 for (icig=0; icig<bam_line->core.n_cigar; icig++)
290 // Conversion from uint32_t to MIDNSHP
293 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
294 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
298 int idx = is_fwd ? icycle : read_len-icycle-ncig;
300 error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
301 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
303 stats->ins_cycles_1st[idx]++;
305 stats->ins_cycles_2nd[idx]++;
307 if ( ncig<=stats->nindels )
308 stats->insertions[ncig-1]++;
313 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
314 if ( idx<0 ) continue; // discard meaningless deletions
315 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
317 stats->del_cycles_1st[idx]++;
319 stats->del_cycles_2nd[idx]++;
320 if ( ncig<=stats->nindels )
321 stats->deletions[ncig-1]++;
324 if ( cig!=3 && cig!=5 )
329 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
331 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
332 int icig,iread=0,icycle=0;
333 int iref = bam_line->core.pos - stats->rseq_pos;
334 int read_len = bam_line->core.l_qseq;
335 uint8_t *read = bam1_seq(bam_line);
336 uint8_t *quals = bam1_qual(bam_line);
337 uint64_t *mpc_buf = stats->mpc_buf;
338 for (icig=0; icig<bam_line->core.n_cigar; icig++)
340 // Conversion from uint32_t to MIDNSHP
343 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
344 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
359 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
369 // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
370 // chunk of refseq in memory. Not very frequent and not noticable in the stats.
371 if ( cig==3 || cig==5 ) continue;
373 error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
375 if ( ncig+iref > stats->nrseq_buf )
376 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
379 for (im=0; im<ncig; im++)
381 uint8_t cread = bam1_seqi(read,iread);
382 uint8_t cref = stats->rseq_buf[iref];
388 int idx = is_fwd ? icycle : read_len-icycle-1;
389 if ( idx>stats->max_len )
390 error("mpc: %d>%d\n",idx,stats->max_len);
391 idx = idx*stats->nquals;
392 if ( idx>=stats->nquals*stats->nbases )
393 error("FIXME: mpc_buf overflow\n");
396 else if ( cref && cread && cref!=cread )
398 uint8_t qual = quals[iread] + 1;
399 if ( qual>=stats->nquals )
400 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
402 int idx = is_fwd ? icycle : read_len-icycle-1;
403 if ( idx>stats->max_len )
404 error("mpc: %d>%d\n",idx,stats->max_len);
406 idx = idx*stats->nquals + qual;
407 if ( idx>=stats->nquals*stats->nbases )
408 error("FIXME: mpc_buf overflow\n");
419 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
421 khash_t(kh_faidx) *h;
425 faidx_t *fai = stats->fai;
428 chr = stats->sam->header->target_name[tid];
430 // ID of the sequence name
431 iter = kh_get(kh_faidx, h, chr);
432 if (iter == kh_end(h))
433 error("No such reference sequence [%s]?\n", chr);
434 val = kh_value(h, iter);
436 // Check the boundaries
438 error("Was the bam file mapped with the reference sequence supplied?"
439 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
440 int size = stats->mrseq_buf;
441 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
442 if (size+pos > val.len) size = val.len-pos;
444 // Position the razf reader
445 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
447 uint8_t *ptr = stats->rseq_buf;
449 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
454 // Conversion between uint8_t coding and ACGT
457 if ( c=='A' || c=='a' )
459 else if ( c=='C' || c=='c' )
461 else if ( c=='G' || c=='g' )
463 else if ( c=='T' || c=='t' )
470 if ( nread < stats->mrseq_buf )
472 memset(ptr,0, stats->mrseq_buf - nread);
473 nread = stats->mrseq_buf;
475 stats->nrseq_buf = nread;
476 stats->rseq_pos = pos;
480 float fai_gc_content(stats_t *stats, int pos, int len)
483 int i = pos - stats->rseq_pos, ito = i + len;
484 assert( i>=0 && ito<=stats->nrseq_buf );
490 c = stats->rseq_buf[i];
496 else if ( c==1 || c==8 )
499 return count ? (float)gc/count : 0;
502 void realloc_rseq_buffer(stats_t *stats)
504 int n = stats->nbases*10;
505 if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
506 if ( stats->mrseq_buf<n )
508 stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
509 stats->mrseq_buf = n;
513 void realloc_gcd_buffer(stats_t *stats, int seq_len)
515 if ( seq_len >= stats->gcd_bin_size )
516 error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
518 int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
519 if ( n <= stats->igcd )
520 error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
522 if ( n > stats->ngcd )
524 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
526 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
527 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
531 realloc_rseq_buffer(stats);
534 void realloc_buffers(stats_t *stats, int seq_len)
536 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
538 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
539 if ( !stats->quals_1st )
540 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
541 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
543 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
544 if ( !stats->quals_2nd )
545 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
546 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
548 if ( stats->mpc_buf )
550 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
551 if ( !stats->mpc_buf )
552 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
553 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
556 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
557 if ( !stats->acgt_cycles )
558 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
559 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
561 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
562 if ( !stats->read_lengths )
563 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
564 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
566 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
567 if ( !stats->insertions )
568 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
569 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
571 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
572 if ( !stats->deletions )
573 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
574 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
576 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
577 if ( !stats->ins_cycles_1st )
578 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
579 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
581 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
582 if ( !stats->ins_cycles_2nd )
583 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
584 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
586 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
587 if ( !stats->del_cycles_1st )
588 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
589 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
591 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
592 if ( !stats->del_cycles_2nd )
593 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
594 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
598 // Realloc the coverage distribution buffer
599 int *rbuffer = calloc(sizeof(int),seq_len*5);
600 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
601 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
602 if ( stats->cov_rbuf.start>1 )
603 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
604 stats->cov_rbuf.start = 0;
605 free(stats->cov_rbuf.buffer);
606 stats->cov_rbuf.buffer = rbuffer;
607 stats->cov_rbuf.size = seq_len*5;
609 realloc_rseq_buffer(stats);
612 void collect_stats(bam1_t *bam_line, stats_t *stats)
614 if ( stats->rg_hash )
616 const uint8_t *rg = bam_aux_get(bam_line, "RG");
618 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
619 if ( k == kh_end(stats->rg_hash) ) return;
621 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
623 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
625 if ( !is_in_regions(bam_line,stats) )
627 if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
630 if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
631 if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
633 int seq_len = bam_line->core.l_qseq;
634 if ( !seq_len ) return;
636 if ( seq_len >= stats->nbases )
637 realloc_buffers(stats,seq_len);
638 if ( stats->max_len<seq_len )
639 stats->max_len = seq_len;
641 stats->read_lengths[seq_len]++;
643 // Count GC and ACGT per cycle
644 uint8_t base, *seq = bam1_seq(bam_line);
647 int reverse = IS_REVERSE(bam_line);
648 for (i=0; i<seq_len; i++)
650 // Conversion from uint8_t coding to ACGT
654 base = bam1_seqi(seq,i);
656 if ( base==1 || base==2 ) gc_count++;
657 else if ( base>2 ) base=3;
658 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
659 error("FIXME: acgt_cycles\n");
660 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
662 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
663 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
664 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
666 // Determine which array (1st or 2nd read) will these stats go to,
667 // trim low quality bases from end the same way BWA does,
670 uint8_t *bam_quals = bam1_qual(bam_line);
671 if ( bam_line->core.flag&BAM_FREAD2 )
673 quals = stats->quals_2nd;
675 for (i=gc_idx_min; i<gc_idx_max; i++)
680 quals = stats->quals_1st;
682 for (i=gc_idx_min; i<gc_idx_max; i++)
685 if ( stats->trim_qual>0 )
686 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
688 // Quality histogram and average quality
689 for (i=0; i<seq_len; i++)
691 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
692 if ( qual>=stats->nquals )
693 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
694 if ( qual>stats->max_qual )
695 stats->max_qual = qual;
697 quals[ i*stats->nquals+qual ]++;
698 stats->sum_qual += qual;
701 // Look at the flags and increment appropriate counters (mapped, paired, etc)
702 if ( IS_UNMAPPED(bam_line) )
703 stats->nreads_unmapped++;
706 if ( !bam_line->core.qual )
709 count_indels(stats,bam_line);
711 if ( !IS_PAIRED(bam_line) )
712 stats->nreads_unpaired++;
715 stats->nreads_paired++;
717 if ( bam_line->core.tid!=bam_line->core.mtid )
718 stats->nreads_anomalous++;
720 // The insert size is tricky, because for long inserts the libraries are
721 // prepared differently and the pairs point in other direction. BWA does
722 // not set the paired flag for them. Similar thing is true also for 454
723 // reads. Mates mapped to different chromosomes have isize==0.
724 int32_t isize = bam_line->core.isize;
725 if ( isize<0 ) isize = -isize;
726 if ( isize >= stats->nisize )
727 isize = stats->nisize-1;
728 if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
730 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
731 int is_fst = IS_READ1(bam_line) ? 1 : -1;
732 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
733 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
735 if ( is_fwd*is_mfwd>0 )
736 stats->isize_other[isize]++;
737 else if ( is_fst*pos_fst>0 )
739 if ( is_fst*is_fwd>0 )
740 stats->isize_inward[isize]++;
742 stats->isize_outward[isize]++;
744 else if ( is_fst*pos_fst<0 )
746 if ( is_fst*is_fwd>0 )
747 stats->isize_outward[isize]++;
749 stats->isize_inward[isize]++;
754 // Number of mismatches
755 uint8_t *nm = bam_aux_get(bam_line,"NM");
757 stats->nmismatches += bam_aux2i(nm);
759 // Number of mapped bases from cigar
760 // Conversion from uint32_t to MIDNSHP
763 if ( bam_line->core.n_cigar == 0)
764 error("FIXME: mapped read with no cigar?\n");
766 if ( stats->regions )
768 // Count only on-target bases
769 int iref = bam_line->core.pos + 1;
770 for (i=0; i<bam_line->core.n_cigar; i++)
772 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
773 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
774 if ( cig==2 ) readlen += ncig;
777 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
778 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
779 if ( ncig<0 ) ncig = 0;
780 stats->nbases_mapped_cigar += ncig;
781 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
786 if ( iref>=stats->reg_from && iref<=stats->reg_to )
787 stats->nbases_mapped_cigar += ncig;
793 // Count the whole read
794 for (i=0; i<bam_line->core.n_cigar; i++)
796 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
797 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
798 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
799 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
802 stats->nbases_mapped += seq_len;
804 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
805 stats->is_sorted = 0;
806 stats->pos = bam_line->core.pos;
808 if ( stats->is_sorted )
810 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
811 round_buffer_flush(stats,-1);
813 // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
814 // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
815 // 20kbp, so the effect is negligible.
818 int inc_ref = 0, inc_gcd = 0;
819 // First pass or new chromosome
820 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
821 // Read goes beyond the end of the rseq buffer
822 else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
823 // Read overlaps the next gcd bin
824 else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
827 if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
832 if ( stats->igcd >= stats->ngcd )
833 realloc_gcd_buffer(stats, readlen);
835 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
836 stats->gcd_pos = bam_line->core.pos;
837 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
840 count_mismatches_per_cycle(stats,bam_line);
842 // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
843 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
845 // First pass or a new chromosome
846 stats->tid = bam_line->core.tid;
847 stats->gcd_pos = bam_line->core.pos;
849 if ( stats->igcd >= stats->ngcd )
850 realloc_gcd_buffer(stats, readlen);
852 stats->gcd[ stats->igcd ].depth++;
853 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
855 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
857 // Coverage distribution graph
858 round_buffer_flush(stats,bam_line->core.pos);
859 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
863 stats->total_len += seq_len;
864 if ( IS_DUP(bam_line) )
866 stats->total_len_dup += seq_len;
871 // Sort by GC and depth
872 #define GCD_t(x) ((gc_depth_t *)x)
873 static int gcd_cmp(const void *a, const void *b)
875 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
876 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
877 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
878 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
883 float gcd_percentile(gc_depth_t *gcd, int N, int p)
893 return gcd[N-1].depth;
896 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
899 void output_stats(stats_t *stats)
901 // Calculate average insert size and standard deviation (from the main bulk data only)
903 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
904 for (isize=0; isize<stats->nisize; isize++)
906 // Each pair was counted twice
907 stats->isize_inward[isize] *= 0.5;
908 stats->isize_outward[isize] *= 0.5;
909 stats->isize_other[isize] *= 0.5;
911 nisize_inward += stats->isize_inward[isize];
912 nisize_outward += stats->isize_outward[isize];
913 nisize_other += stats->isize_other[isize];
914 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
917 double bulk=0, avg_isize=0, sd_isize=0;
918 for (isize=0; isize<stats->nisize; isize++)
920 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
921 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
923 if ( bulk/nisize > stats->isize_main_bulk )
930 avg_isize /= nisize ? nisize : 1;
931 for (isize=1; isize<ibulk; isize++)
932 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
933 sd_isize = sqrt(sd_isize);
936 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
937 printf("# The command line was: %s",stats->argv[0]);
939 for (i=1; i<stats->argc; i++)
940 printf(" %s",stats->argv[i]);
942 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
943 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
944 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
945 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
946 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
947 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
948 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
949 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
950 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
951 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
952 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
953 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
954 printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
955 printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
956 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
957 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
958 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
959 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
960 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
961 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
962 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
963 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
964 printf("SN\taverage length:\t%.0f\n", avg_read_length);
965 printf("SN\tmaximum length:\t%d\n", stats->max_len);
966 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
967 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
968 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
969 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
970 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
971 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
972 printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
975 if ( stats->max_len<stats->nbases ) stats->max_len++;
976 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
977 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
978 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
979 for (ibase=0; ibase<stats->max_len; ibase++)
981 printf("FFQ\t%d",ibase+1);
982 for (iqual=0; iqual<=stats->max_qual; iqual++)
984 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
988 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
989 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
990 for (ibase=0; ibase<stats->max_len; ibase++)
992 printf("LFQ\t%d",ibase+1);
993 for (iqual=0; iqual<=stats->max_qual; iqual++)
995 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
999 if ( stats->mpc_buf )
1001 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
1002 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
1003 printf("# is the number of N's and the rest is the number of mismatches\n");
1004 for (ibase=0; ibase<stats->max_len; ibase++)
1006 printf("MPC\t%d",ibase+1);
1007 for (iqual=0; iqual<=stats->max_qual; iqual++)
1009 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
1014 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
1016 for (ibase=0; ibase<stats->ngc; ibase++)
1018 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
1019 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
1022 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
1024 for (ibase=0; ibase<stats->ngc; ibase++)
1026 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
1027 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
1030 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
1031 for (ibase=0; ibase<stats->max_len; ibase++)
1033 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
1034 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
1035 if ( ! sum ) continue;
1036 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
1038 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
1039 for (isize=0; isize<ibulk; isize++)
1040 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
1041 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
1043 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
1045 for (ilen=0; ilen<stats->max_len; ilen++)
1047 if ( stats->read_lengths[ilen]>0 )
1048 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
1051 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
1052 for (ilen=0; ilen<stats->nindels; ilen++)
1054 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
1055 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
1058 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
1059 for (ilen=0; ilen<=stats->nbases; ilen++)
1061 // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
1062 // the index of the cycle of the first inserted base (also 1-based)
1063 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
1064 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
1067 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
1068 if ( stats->cov[0] )
1069 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
1071 for (icov=1; icov<stats->ncov-1; icov++)
1072 if ( stats->cov[icov] )
1073 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
1074 if ( stats->cov[stats->ncov-1] )
1075 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
1077 // Calculate average GC content, then sort by GC and depth
1078 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
1080 for (igcd=0; igcd<stats->igcd; igcd++)
1083 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
1085 if ( stats->gcd[igcd].depth )
1086 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
1088 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1090 while ( igcd < stats->igcd )
1092 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1093 uint32_t nbins=0, itmp=igcd;
1094 float gc = stats->gcd[igcd].gc;
1095 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1100 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1101 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1102 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
1103 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
1104 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
1105 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
1111 size_t mygetline(char **line, size_t *n, FILE *fp)
1113 if (line == NULL || n == NULL || fp == NULL)
1118 if (*n==0 || !*line)
1126 while ((c=getc(fp))!= EOF && c!='\n')
1131 *line = realloc(*line, sizeof(char)*(*n));
1133 (*line)[nread-1] = c;
1138 *line = realloc(*line, sizeof(char)*(*n));
1141 return nread>0 ? nread : -1;
1145 void init_regions(stats_t *stats, char *file)
1148 khash_t(kh_bam_tid) *header_hash;
1150 bam_init_header_hash(stats->sam->header);
1151 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
1153 FILE *fp = fopen(file,"r");
1154 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1160 int prev_tid=-1, prev_pos=-1;
1161 while ((nread = mygetline(&line, &len, fp)) != -1)
1163 if ( line[0] == '#' ) continue;
1166 while ( i<nread && !isspace(line[i]) ) i++;
1167 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1170 iter = kh_get(kh_bam_tid, header_hash, line);
1171 int tid = kh_val(header_hash, iter);
1172 if ( iter == kh_end(header_hash) )
1175 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
1180 if ( tid >= stats->nregions )
1182 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1184 for (j=stats->nregions; j<stats->nregions+100; j++)
1186 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1187 stats->regions[j].pos = NULL;
1189 stats->nregions += 100;
1191 int npos = stats->regions[tid].npos;
1192 if ( npos >= stats->regions[tid].mpos )
1194 stats->regions[tid].mpos += 1000;
1195 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1198 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1199 if ( prev_tid==-1 || prev_tid!=tid )
1202 prev_pos = stats->regions[tid].pos[npos].from;
1204 if ( prev_pos>stats->regions[tid].pos[npos].from )
1205 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1206 stats->regions[tid].npos++;
1208 if (line) free(line);
1209 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
1213 void destroy_regions(stats_t *stats)
1216 for (i=0; i<stats->nregions; i++)
1218 if ( !stats->regions[i].mpos ) continue;
1219 free(stats->regions[i].pos);
1221 if ( stats->regions ) free(stats->regions);
1224 static int fetch_read(const bam1_t *bam_line, void *data)
1226 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1230 void reset_regions(stats_t *stats)
1233 for (i=0; i<stats->nregions; i++)
1234 stats->regions[i].cpos = 0;
1237 int is_in_regions(bam1_t *bam_line, stats_t *stats)
1239 if ( !stats->regions ) return 1;
1241 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1242 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1244 regions_t *reg = &stats->regions[bam_line->core.tid];
1245 if ( reg->cpos==reg->npos ) return 0; // done for this chr
1247 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1248 // even small overlap is enough to include the read in the stats.
1250 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1251 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1252 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1254 stats->reg_from = reg->pos[i].from;
1255 stats->reg_to = reg->pos[i].to;
1260 void init_group_id(stats_t *stats, char *id)
1262 if ( !stats->sam->header->dict )
1263 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
1264 void *iter = stats->sam->header->dict;
1265 const char *key, *val;
1267 stats->rg_hash = kh_init(kh_rg);
1268 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
1270 if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
1272 khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
1273 if ( k != kh_end(stats->rg_hash) )
1274 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
1276 k = kh_put(kh_rg, stats->rg_hash, key, &ret);
1277 kh_value(stats->rg_hash, k) = val;
1282 error("The sample or read group \"%s\" not present.\n", id);
1286 void error(const char *format, ...)
1290 printf("Version: %s\n", BAMCHECK_VERSION);
1291 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1292 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1293 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1294 printf("Options:\n");
1295 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1296 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1297 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
1298 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
1299 printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
1300 printf(" -h, --help This help message\n");
1301 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1302 printf(" -I, --id <string> Include only listed read group or sample name\n");
1303 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1304 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1305 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1306 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1307 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1308 printf(" -s, --sam Input is SAM\n");
1314 va_start(ap, format);
1315 vfprintf(stderr, format, ap);
1321 int main(int argc, char *argv[])
1323 char *targets = NULL;
1324 char *bam_fname = NULL;
1325 char *group_id = NULL;
1326 samfile_t *sam = NULL;
1329 stats_t *stats = calloc(1,sizeof(stats_t));
1331 stats->nquals = 256;
1332 stats->nbases = 300;
1333 stats->nisize = 8000;
1334 stats->max_len = 30;
1335 stats->max_qual = 40;
1336 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1337 stats->gcd_bin_size = 20e3;
1338 stats->gcd_ref_size = 4.2e9;
1339 stats->rseq_pos = -1;
1340 stats->tid = stats->gcd_pos = -1;
1342 stats->is_sorted = 1;
1344 stats->cov_max = 1000;
1345 stats->cov_step = 1;
1348 stats->filter_readlen = -1;
1349 stats->nindels = stats->nbases;
1351 strcpy(in_mode, "rb");
1353 static struct option loptions[] =
1356 {"remove-dups",0,0,'d'},
1358 {"ref-seq",1,0,'r'},
1359 {"coverage",1,0,'c'},
1360 {"read-length",1,0,'l'},
1361 {"insert-size",1,0,'i'},
1362 {"most-inserts",1,0,'m'},
1363 {"trim-quality",1,0,'q'},
1364 {"target-regions",0,0,'t'},
1365 {"required-flag",1,0,'f'},
1366 {"filtering-flag",0,0,'F'},
1372 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
1376 case 'f': stats->flag_require=strtol(optarg,0,0); break;
1377 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
1378 case 'd': stats->flag_filter|=BAM_FDUP; break;
1379 case 's': strcpy(in_mode, "r"); break;
1380 case 'r': stats->fai = fai_load(optarg);
1382 error("Could not load faidx: %s\n", optarg);
1386 if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
1387 error("Unable to parse --GC-depth %s\n", optarg);
1388 stats->gcd_bin_size = fbin;
1389 stats->gcd_ref_size = flen;
1392 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1393 error("Unable to parse -c %s\n", optarg);
1395 case 'l': stats->filter_readlen = atoi(optarg); break;
1396 case 'i': stats->nisize = atoi(optarg); break;
1397 case 'm': stats->isize_main_bulk = atof(optarg); break;
1398 case 'q': stats->trim_qual = atoi(optarg); break;
1399 case 't': targets = optarg; break;
1400 case 'I': group_id = optarg; break;
1402 case 'h': error(NULL);
1403 default: error("Unknown argument: %s\n", optarg);
1407 bam_fname = argv[optind++];
1411 if ( isatty(fileno((FILE *)stdin)) )
1417 // .. coverage bins and round buffer
1418 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1420 stats->cov_step = stats->cov_max - stats->cov_min;
1421 if ( stats->cov_step <= 0 )
1422 stats->cov_step = 1;
1424 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1425 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1426 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1427 stats->cov_rbuf.size = stats->nbases*5;
1428 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1430 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1431 error("Failed to open: %s\n", bam_fname);
1433 if ( group_id ) init_group_id(stats, group_id);
1434 bam1_t *bam_line = bam_init1();
1436 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1437 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1438 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1439 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1440 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1441 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1442 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1443 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1444 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1445 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1446 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1447 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1448 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1449 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1450 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1451 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1452 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1453 realloc_rseq_buffer(stats);
1455 init_regions(stats, targets);
1457 // Collect statistics
1460 // Collect stats in selected regions only
1461 bam_index_t *bam_idx = bam_index_load(bam_fname);
1463 error("Random alignment retrieval only works for indexed BAM files.\n");
1466 for (i=optind; i<argc; i++)
1469 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1470 if ( tid < 0 ) continue;
1471 reset_regions(stats);
1472 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1474 bam_index_destroy(bam_idx);
1478 // Stream through the entire BAM ignoring off-target regions if -t is given
1479 while (samread(sam,bam_line) >= 0)
1480 collect_stats(bam_line,stats);
1482 round_buffer_flush(stats,-1);
1484 output_stats(stats);
1486 bam_destroy1(bam_line);
1487 samclose(stats->sam);
1488 if (stats->fai) fai_destroy(stats->fai);
1489 free(stats->cov_rbuf.buffer); free(stats->cov);
1490 free(stats->quals_1st); free(stats->quals_2nd);
1491 free(stats->gc_1st); free(stats->gc_2nd);
1492 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1494 free(stats->rseq_buf);
1495 free(stats->mpc_buf);
1496 free(stats->acgt_cycles);
1497 free(stats->read_lengths);
1498 free(stats->insertions);
1499 free(stats->deletions);
1500 free(stats->ins_cycles_1st);
1501 free(stats->ins_cycles_2nd);
1502 free(stats->del_cycles_1st);
1503 free(stats->del_cycles_2nd);
1504 destroy_regions(stats);
1506 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);