2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14 good enough to include the read in the stats.
18 #define BAMCHECK_VERSION "2012-09-04"
20 #define _ISOC99_SOURCE
33 #include "sam_header.h"
36 #define BWA_MIN_RDLEN 35
37 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
38 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
39 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
40 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
41 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
42 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
43 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
47 int32_t line_len, line_blen;
52 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
53 KHASH_MAP_INIT_STR(kh_bam_tid, int)
54 KHASH_MAP_INIT_STR(kh_rg, const char *)
59 khash_t(kh_faidx) *hash;
69 // For coverage distribution, a simple pileup
78 typedef struct { uint32_t from, to; } pos_t;
89 int trim_qual; // bwa trim quality
91 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
92 // insert size histogram holder
93 int nquals; // The number of quality bins
94 int nbases; // The maximum sequence length the allocated array can hold
95 int nisize; // The maximum insert size that the allocated array can hold
96 int ngc; // The size of gc_1st and gc_2nd
97 int nindels; // The maximum indel length for indel distribution
99 // Arrays for the histogram data
100 uint64_t *quals_1st, *quals_2nd;
101 uint64_t *gc_1st, *gc_2nd;
102 uint64_t *isize_inward, *isize_outward, *isize_other;
103 uint64_t *acgt_cycles;
104 uint64_t *read_lengths;
105 uint64_t *insertions, *deletions;
106 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
108 // The extremes encountered
109 int max_len; // Maximum read length
110 int max_qual; // Maximum quality
111 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
116 uint64_t total_len_dup;
120 uint64_t nreads_unmapped;
121 uint64_t nreads_unpaired;
122 uint64_t nreads_paired;
123 uint64_t nreads_anomalous;
125 uint64_t nbases_mapped;
126 uint64_t nbases_mapped_cigar;
127 uint64_t nbases_trimmed; // bwa trimmed bases
128 uint64_t nmismatches;
130 // GC-depth related data
131 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
132 gc_depth_t *gcd; // The GC-depth bins holder
133 int gcd_bin_size; // The size of GC-depth bin
134 uint32_t gcd_ref_size; // The approximate size of the genome
135 int32_t tid, gcd_pos; // Position of the current bin
136 int32_t pos; // Position of the last read
138 // Coverage distribution related data
139 int ncov; // The number of coverage bins
140 uint64_t *cov; // The coverage frequencies
141 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
142 round_buffer_t cov_rbuf; // Pileup round buffer
144 // Mismatches by read cycle
145 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
146 int mrseq_buf; // The size of the buffer
147 int32_t rseq_pos; // The coordinate of the first base in the buffer
148 int32_t nrseq_buf; // The used part of the buffer
149 uint64_t *mpc_buf; // Mismatches per cycle
155 int nregions, reg_from,reg_to;
159 int flag_require, flag_filter;
160 double sum_qual; // For calculating average quality value
162 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
163 faidx_t *fai; // Reference sequence for GC-depth graph
164 int argc; // Command line arguments to be printed on the output
169 void error(const char *format, ...);
170 void bam_init_header_hash(bam_header_t *header);
171 int is_in_regions(bam1_t *bam_line, stats_t *stats);
174 // Coverage distribution methods
175 inline int coverage_idx(int min, int max, int n, int step, int depth)
183 return 1 + (depth - min) / step;
186 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
188 return (offset + (pos-refpos) % size) % size;
191 void round_buffer_flush(stats_t *stats, int64_t pos)
195 if ( pos==stats->cov_rbuf.pos )
198 int64_t new_pos = pos;
199 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
201 // Flush the whole buffer, but in sequential order,
202 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
205 if ( pos < stats->cov_rbuf.pos )
206 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
208 int ifrom = stats->cov_rbuf.start;
209 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
212 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
214 if ( !stats->cov_rbuf.buffer[ibuf] )
216 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
218 stats->cov_rbuf.buffer[ibuf] = 0;
222 for (ibuf=ifrom; ibuf<=ito; ibuf++)
224 if ( !stats->cov_rbuf.buffer[ibuf] )
226 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
228 stats->cov_rbuf.buffer[ibuf] = 0;
230 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
231 stats->cov_rbuf.pos = new_pos;
234 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
236 if ( to-from >= rbuf->size )
237 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
238 if ( from < rbuf->pos )
239 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
242 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
243 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
246 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
247 rbuf->buffer[ibuf]++;
250 for (ibuf=ifrom; ibuf<=ito; ibuf++)
251 rbuf->buffer[ibuf]++;
254 // Calculate the number of bases in the read trimmed by BWA
255 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
257 if ( len<BWA_MIN_RDLEN ) return 0;
259 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
260 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
261 int max_trimmed = len - BWA_MIN_RDLEN + 1;
262 int l, sum=0, max_sum=0, max_l=0;
264 for (l=0; l<max_trimmed; l++)
266 sum += trim_qual - quals[ reverse ? l : len-1-l ];
271 // This is the correct way, but bwa clips from some reason one base less
280 void count_indels(stats_t *stats,bam1_t *bam_line)
282 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
283 int is_1st = IS_READ1(bam_line) ? 1 : 0;
286 int read_len = bam_line->core.l_qseq;
287 for (icig=0; icig<bam_line->core.n_cigar; icig++)
289 // Conversion from uint32_t to MIDNSHP
292 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
293 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
297 int idx = is_fwd ? icycle : read_len-icycle-ncig;
299 error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
300 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
302 stats->ins_cycles_1st[idx]++;
304 stats->ins_cycles_2nd[idx]++;
306 if ( ncig<=stats->nindels )
307 stats->insertions[ncig-1]++;
312 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
313 if ( idx<0 ) continue; // discard meaningless deletions
314 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
316 stats->del_cycles_1st[idx]++;
318 stats->del_cycles_2nd[idx]++;
319 if ( ncig<=stats->nindels )
320 stats->deletions[ncig-1]++;
323 if ( cig!=3 && cig!=5 )
328 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
330 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
331 int icig,iread=0,icycle=0;
332 int iref = bam_line->core.pos - stats->rseq_pos;
333 int read_len = bam_line->core.l_qseq;
334 uint8_t *read = bam1_seq(bam_line);
335 uint8_t *quals = bam1_qual(bam_line);
336 uint64_t *mpc_buf = stats->mpc_buf;
337 for (icig=0; icig<bam_line->core.n_cigar; icig++)
339 // Conversion from uint32_t to MIDNSHP
342 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
343 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
358 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
368 // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
369 // chunk of refseq in memory. Not very frequent and not noticable in the stats.
370 if ( cig==3 || cig==5 ) continue;
372 error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
374 if ( ncig+iref > stats->nrseq_buf )
375 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
378 for (im=0; im<ncig; im++)
380 uint8_t cread = bam1_seqi(read,iread);
381 uint8_t cref = stats->rseq_buf[iref];
387 int idx = is_fwd ? icycle : read_len-icycle-1;
388 if ( idx>stats->max_len )
389 error("mpc: %d>%d\n",idx,stats->max_len);
390 idx = idx*stats->nquals;
391 if ( idx>=stats->nquals*stats->nbases )
392 error("FIXME: mpc_buf overflow\n");
395 else if ( cref && cread && cref!=cread )
397 uint8_t qual = quals[iread] + 1;
398 if ( qual>=stats->nquals )
399 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
401 int idx = is_fwd ? icycle : read_len-icycle-1;
402 if ( idx>stats->max_len )
403 error("mpc: %d>%d\n",idx,stats->max_len);
405 idx = idx*stats->nquals + qual;
406 if ( idx>=stats->nquals*stats->nbases )
407 error("FIXME: mpc_buf overflow\n");
418 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
420 khash_t(kh_faidx) *h;
424 faidx_t *fai = stats->fai;
427 chr = stats->sam->header->target_name[tid];
429 // ID of the sequence name
430 iter = kh_get(kh_faidx, h, chr);
431 if (iter == kh_end(h))
432 error("No such reference sequence [%s]?\n", chr);
433 val = kh_value(h, iter);
435 // Check the boundaries
437 error("Was the bam file mapped with the reference sequence supplied?"
438 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
439 int size = stats->mrseq_buf;
440 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
441 if (size+pos > val.len) size = val.len-pos;
443 // Position the razf reader
444 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
446 uint8_t *ptr = stats->rseq_buf;
448 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
453 // Conversion between uint8_t coding and ACGT
456 if ( c=='A' || c=='a' )
458 else if ( c=='C' || c=='c' )
460 else if ( c=='G' || c=='g' )
462 else if ( c=='T' || c=='t' )
469 if ( nread < stats->mrseq_buf )
471 memset(ptr,0, stats->mrseq_buf - nread);
472 nread = stats->mrseq_buf;
474 stats->nrseq_buf = nread;
475 stats->rseq_pos = pos;
479 float fai_gc_content(stats_t *stats, int pos, int len)
482 int i = pos - stats->rseq_pos, ito = i + len;
483 assert( i>=0 && ito<=stats->nrseq_buf );
489 c = stats->rseq_buf[i];
495 else if ( c==1 || c==8 )
498 return count ? (float)gc/count : 0;
501 void realloc_rseq_buffer(stats_t *stats)
503 int n = stats->nbases*10;
504 if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
505 if ( stats->mrseq_buf<n )
507 stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
508 stats->mrseq_buf = n;
512 void realloc_gcd_buffer(stats_t *stats, int seq_len)
514 if ( seq_len >= stats->gcd_bin_size )
515 error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
517 int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
518 if ( n <= stats->igcd )
519 error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
521 if ( n > stats->ngcd )
523 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
525 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
526 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
530 realloc_rseq_buffer(stats);
533 void realloc_buffers(stats_t *stats, int seq_len)
535 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
537 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
538 if ( !stats->quals_1st )
539 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
540 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
542 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
543 if ( !stats->quals_2nd )
544 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
545 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
547 if ( stats->mpc_buf )
549 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
550 if ( !stats->mpc_buf )
551 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
552 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
555 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
556 if ( !stats->acgt_cycles )
557 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
558 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
560 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
561 if ( !stats->read_lengths )
562 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
563 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
565 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
566 if ( !stats->insertions )
567 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
568 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
570 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
571 if ( !stats->deletions )
572 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
573 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
575 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
576 if ( !stats->ins_cycles_1st )
577 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
578 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
580 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
581 if ( !stats->ins_cycles_2nd )
582 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
583 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
585 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
586 if ( !stats->del_cycles_1st )
587 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
588 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
590 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
591 if ( !stats->del_cycles_2nd )
592 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
593 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
597 // Realloc the coverage distribution buffer
598 int *rbuffer = calloc(sizeof(int),seq_len*5);
599 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
600 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
601 if ( stats->cov_rbuf.start>1 )
602 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
603 stats->cov_rbuf.start = 0;
604 free(stats->cov_rbuf.buffer);
605 stats->cov_rbuf.buffer = rbuffer;
606 stats->cov_rbuf.size = seq_len*5;
608 realloc_rseq_buffer(stats);
611 void collect_stats(bam1_t *bam_line, stats_t *stats)
613 if ( stats->rg_hash )
615 const uint8_t *rg = bam_aux_get(bam_line, "RG");
617 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
618 if ( k == kh_end(stats->rg_hash) ) return;
620 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
622 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
625 if ( !is_in_regions(bam_line,stats) )
628 int seq_len = bam_line->core.l_qseq;
629 if ( !seq_len ) return;
630 if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
631 if ( seq_len >= stats->nbases )
632 realloc_buffers(stats,seq_len);
633 if ( stats->max_len<seq_len )
634 stats->max_len = seq_len;
636 stats->read_lengths[seq_len]++;
638 // Count GC and ACGT per cycle
639 uint8_t base, *seq = bam1_seq(bam_line);
642 int reverse = IS_REVERSE(bam_line);
643 for (i=0; i<seq_len; i++)
645 // Conversion from uint8_t coding to ACGT
649 base = bam1_seqi(seq,i);
651 if ( base==1 || base==2 ) gc_count++;
652 else if ( base>2 ) base=3;
653 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
654 error("FIXME: acgt_cycles\n");
655 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
657 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
658 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
659 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
661 // Determine which array (1st or 2nd read) will these stats go to,
662 // trim low quality bases from end the same way BWA does,
665 uint8_t *bam_quals = bam1_qual(bam_line);
666 if ( bam_line->core.flag&BAM_FREAD2 )
668 quals = stats->quals_2nd;
670 for (i=gc_idx_min; i<gc_idx_max; i++)
675 quals = stats->quals_1st;
677 for (i=gc_idx_min; i<gc_idx_max; i++)
680 if ( stats->trim_qual>0 )
681 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
683 // Quality histogram and average quality
684 for (i=0; i<seq_len; i++)
686 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
687 if ( qual>=stats->nquals )
688 error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
689 if ( qual>stats->max_qual )
690 stats->max_qual = qual;
692 quals[ i*stats->nquals+qual ]++;
693 stats->sum_qual += qual;
696 // Look at the flags and increment appropriate counters (mapped, paired, etc)
697 if ( IS_UNMAPPED(bam_line) )
698 stats->nreads_unmapped++;
701 if ( !bam_line->core.qual )
704 count_indels(stats,bam_line);
706 if ( !IS_PAIRED(bam_line) )
707 stats->nreads_unpaired++;
710 stats->nreads_paired++;
712 if ( bam_line->core.tid!=bam_line->core.mtid )
713 stats->nreads_anomalous++;
715 // The insert size is tricky, because for long inserts the libraries are
716 // prepared differently and the pairs point in other direction. BWA does
717 // not set the paired flag for them. Similar thing is true also for 454
718 // reads. Mates mapped to different chromosomes have isize==0.
719 int32_t isize = bam_line->core.isize;
720 if ( isize<0 ) isize = -isize;
721 if ( isize >= stats->nisize )
722 isize = stats->nisize-1;
723 if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
725 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
726 int is_fst = IS_READ1(bam_line) ? 1 : -1;
727 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
728 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
730 if ( is_fwd*is_mfwd>0 )
731 stats->isize_other[isize]++;
732 else if ( is_fst*pos_fst>0 )
734 if ( is_fst*is_fwd>0 )
735 stats->isize_inward[isize]++;
737 stats->isize_outward[isize]++;
739 else if ( is_fst*pos_fst<0 )
741 if ( is_fst*is_fwd>0 )
742 stats->isize_outward[isize]++;
744 stats->isize_inward[isize]++;
749 // Number of mismatches
750 uint8_t *nm = bam_aux_get(bam_line,"NM");
752 stats->nmismatches += bam_aux2i(nm);
754 // Number of mapped bases from cigar
755 // Conversion from uint32_t to MIDNSHP
758 if ( bam_line->core.n_cigar == 0)
759 error("FIXME: mapped read with no cigar?\n");
761 if ( stats->regions )
763 // Count only on-target bases
764 int iref = bam_line->core.pos + 1;
765 for (i=0; i<bam_line->core.n_cigar; i++)
767 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
768 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
769 if ( cig==2 ) readlen += ncig;
772 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
773 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
774 if ( ncig<0 ) ncig = 0;
775 stats->nbases_mapped_cigar += ncig;
776 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
781 if ( iref>=stats->reg_from && iref<=stats->reg_to )
782 stats->nbases_mapped_cigar += ncig;
788 // Count the whole read
789 for (i=0; i<bam_line->core.n_cigar; i++)
791 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
792 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
793 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
794 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
797 stats->nbases_mapped += seq_len;
799 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
800 stats->is_sorted = 0;
801 stats->pos = bam_line->core.pos;
803 if ( stats->is_sorted )
805 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
806 round_buffer_flush(stats,-1);
808 // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
809 // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
810 // 20kbp, so the effect is negligible.
813 int inc_ref = 0, inc_gcd = 0;
814 // First pass or new chromosome
815 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
816 // Read goes beyond the end of the rseq buffer
817 else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
818 // Read overlaps the next gcd bin
819 else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
822 if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
827 if ( stats->igcd >= stats->ngcd )
828 realloc_gcd_buffer(stats, readlen);
830 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
831 stats->gcd_pos = bam_line->core.pos;
832 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
835 count_mismatches_per_cycle(stats,bam_line);
837 // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
838 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
840 // First pass or a new chromosome
841 stats->tid = bam_line->core.tid;
842 stats->gcd_pos = bam_line->core.pos;
844 if ( stats->igcd >= stats->ngcd )
845 realloc_gcd_buffer(stats, readlen);
847 stats->gcd[ stats->igcd ].depth++;
848 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
850 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
852 // Coverage distribution graph
853 round_buffer_flush(stats,bam_line->core.pos);
854 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
858 stats->total_len += seq_len;
859 if ( IS_DUP(bam_line) )
861 stats->total_len_dup += seq_len;
866 // Sort by GC and depth
867 #define GCD_t(x) ((gc_depth_t *)x)
868 static int gcd_cmp(const void *a, const void *b)
870 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
871 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
872 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
873 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
878 float gcd_percentile(gc_depth_t *gcd, int N, int p)
888 return gcd[N-1].depth;
891 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
894 void output_stats(stats_t *stats)
896 // Calculate average insert size and standard deviation (from the main bulk data only)
898 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
899 for (isize=0; isize<stats->nisize; isize++)
901 // Each pair was counted twice
902 stats->isize_inward[isize] *= 0.5;
903 stats->isize_outward[isize] *= 0.5;
904 stats->isize_other[isize] *= 0.5;
906 nisize_inward += stats->isize_inward[isize];
907 nisize_outward += stats->isize_outward[isize];
908 nisize_other += stats->isize_other[isize];
909 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
912 double bulk=0, avg_isize=0, sd_isize=0;
913 for (isize=0; isize<stats->nisize; isize++)
915 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
916 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
918 if ( bulk/nisize > stats->isize_main_bulk )
925 avg_isize /= nisize ? nisize : 1;
926 for (isize=1; isize<ibulk; isize++)
927 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
928 sd_isize = sqrt(sd_isize);
931 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
932 printf("# The command line was: %s",stats->argv[0]);
934 for (i=1; i<stats->argc; i++)
935 printf(" %s",stats->argv[i]);
937 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
938 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
939 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
940 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
941 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
942 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
943 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
944 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
945 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
946 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
947 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
948 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
949 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
950 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
951 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
952 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
953 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
954 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
955 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
956 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
957 printf("SN\taverage length:\t%.0f\n", avg_read_length);
958 printf("SN\tmaximum length:\t%d\n", stats->max_len);
959 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
960 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
961 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
962 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
963 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
964 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
965 printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
968 if ( stats->max_len<stats->nbases ) stats->max_len++;
969 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
970 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
971 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
972 for (ibase=0; ibase<stats->max_len; ibase++)
974 printf("FFQ\t%d",ibase+1);
975 for (iqual=0; iqual<=stats->max_qual; iqual++)
977 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
981 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
982 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
983 for (ibase=0; ibase<stats->max_len; ibase++)
985 printf("LFQ\t%d",ibase+1);
986 for (iqual=0; iqual<=stats->max_qual; iqual++)
988 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
992 if ( stats->mpc_buf )
994 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
995 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
996 printf("# is the number of N's and the rest is the number of mismatches\n");
997 for (ibase=0; ibase<stats->max_len; ibase++)
999 printf("MPC\t%d",ibase+1);
1000 for (iqual=0; iqual<=stats->max_qual; iqual++)
1002 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
1007 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
1009 for (ibase=0; ibase<stats->ngc; ibase++)
1011 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
1012 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
1015 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
1017 for (ibase=0; ibase<stats->ngc; ibase++)
1019 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
1020 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
1023 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
1024 for (ibase=0; ibase<stats->max_len; ibase++)
1026 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
1027 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
1028 if ( ! sum ) continue;
1029 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
1031 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
1032 for (isize=0; isize<ibulk; isize++)
1033 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
1034 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
1036 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
1038 for (ilen=0; ilen<stats->max_len; ilen++)
1040 if ( stats->read_lengths[ilen]>0 )
1041 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
1044 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
1045 for (ilen=0; ilen<stats->nindels; ilen++)
1047 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
1048 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
1051 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
1052 for (ilen=0; ilen<=stats->nbases; ilen++)
1054 // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
1055 // the index of the cycle of the first inserted base (also 1-based)
1056 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
1057 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
1060 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
1061 if ( stats->cov[0] )
1062 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
1064 for (icov=1; icov<stats->ncov-1; icov++)
1065 if ( stats->cov[icov] )
1066 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
1067 if ( stats->cov[stats->ncov-1] )
1068 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
1070 // Calculate average GC content, then sort by GC and depth
1071 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
1073 for (igcd=0; igcd<stats->igcd; igcd++)
1076 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
1078 if ( stats->gcd[igcd].depth )
1079 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
1081 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1083 while ( igcd < stats->igcd )
1085 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1086 uint32_t nbins=0, itmp=igcd;
1087 float gc = stats->gcd[igcd].gc;
1088 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1093 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1094 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1095 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
1096 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
1097 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
1098 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
1104 size_t mygetline(char **line, size_t *n, FILE *fp)
1106 if (line == NULL || n == NULL || fp == NULL)
1111 if (*n==0 || !*line)
1119 while ((c=getc(fp))!= EOF && c!='\n')
1124 *line = realloc(*line, sizeof(char)*(*n));
1126 (*line)[nread-1] = c;
1131 *line = realloc(*line, sizeof(char)*(*n));
1134 return nread>0 ? nread : -1;
1138 void init_regions(stats_t *stats, char *file)
1141 khash_t(kh_bam_tid) *header_hash;
1143 bam_init_header_hash(stats->sam->header);
1144 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
1146 FILE *fp = fopen(file,"r");
1147 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1153 int prev_tid=-1, prev_pos=-1;
1154 while ((nread = mygetline(&line, &len, fp)) != -1)
1156 if ( line[0] == '#' ) continue;
1159 while ( i<nread && !isspace(line[i]) ) i++;
1160 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1163 iter = kh_get(kh_bam_tid, header_hash, line);
1164 int tid = kh_val(header_hash, iter);
1165 if ( iter == kh_end(header_hash) )
1168 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
1173 if ( tid >= stats->nregions )
1175 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1177 for (j=stats->nregions; j<stats->nregions+100; j++)
1179 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1180 stats->regions[j].pos = NULL;
1182 stats->nregions += 100;
1184 int npos = stats->regions[tid].npos;
1185 if ( npos >= stats->regions[tid].mpos )
1187 stats->regions[tid].mpos += 1000;
1188 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1191 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1192 if ( prev_tid==-1 || prev_tid!=tid )
1195 prev_pos = stats->regions[tid].pos[npos].from;
1197 if ( prev_pos>stats->regions[tid].pos[npos].from )
1198 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1199 stats->regions[tid].npos++;
1201 if (line) free(line);
1202 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
1206 void destroy_regions(stats_t *stats)
1209 for (i=0; i<stats->nregions; i++)
1211 if ( !stats->regions[i].mpos ) continue;
1212 free(stats->regions[i].pos);
1214 if ( stats->regions ) free(stats->regions);
1217 static int fetch_read(const bam1_t *bam_line, void *data)
1219 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1223 void reset_regions(stats_t *stats)
1226 for (i=0; i<stats->nregions; i++)
1227 stats->regions[i].cpos = 0;
1230 int is_in_regions(bam1_t *bam_line, stats_t *stats)
1232 if ( !stats->regions ) return 1;
1234 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1235 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1237 regions_t *reg = &stats->regions[bam_line->core.tid];
1238 if ( reg->cpos==reg->npos ) return 0; // done for this chr
1240 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1241 // even small overlap is enough to include the read in the stats.
1243 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1244 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1245 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1247 stats->reg_from = reg->pos[i].from;
1248 stats->reg_to = reg->pos[i].to;
1253 void init_group_id(stats_t *stats, char *id)
1255 if ( !stats->sam->header->dict )
1256 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
1257 void *iter = stats->sam->header->dict;
1258 const char *key, *val;
1260 stats->rg_hash = kh_init(kh_rg);
1261 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
1263 if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
1265 khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
1266 if ( k != kh_end(stats->rg_hash) )
1267 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
1269 k = kh_put(kh_rg, stats->rg_hash, key, &ret);
1270 kh_value(stats->rg_hash, k) = val;
1275 error("The sample or read group \"%s\" not present.\n", id);
1279 void error(const char *format, ...)
1283 printf("Version: %s\n", BAMCHECK_VERSION);
1284 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1285 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1286 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1287 printf("Options:\n");
1288 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1289 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1290 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
1291 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
1292 printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
1293 printf(" -h, --help This help message\n");
1294 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1295 printf(" -I, --id <string> Include only listed read group or sample name\n");
1296 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1297 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1298 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1299 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1300 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1301 printf(" -s, --sam Input is SAM\n");
1307 va_start(ap, format);
1308 vfprintf(stderr, format, ap);
1314 int main(int argc, char *argv[])
1316 char *targets = NULL;
1317 char *bam_fname = NULL;
1318 char *group_id = NULL;
1319 samfile_t *sam = NULL;
1322 stats_t *stats = calloc(1,sizeof(stats_t));
1324 stats->nquals = 256;
1325 stats->nbases = 300;
1326 stats->nisize = 8000;
1327 stats->max_len = 30;
1328 stats->max_qual = 40;
1329 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1330 stats->gcd_bin_size = 20e3;
1331 stats->gcd_ref_size = 4.2e9;
1332 stats->rseq_pos = -1;
1333 stats->tid = stats->gcd_pos = -1;
1335 stats->is_sorted = 1;
1337 stats->cov_max = 1000;
1338 stats->cov_step = 1;
1341 stats->filter_readlen = -1;
1342 stats->nindels = stats->nbases;
1344 strcpy(in_mode, "rb");
1346 static struct option loptions[] =
1349 {"remove-dups",0,0,'d'},
1351 {"ref-seq",1,0,'r'},
1352 {"coverage",1,0,'c'},
1353 {"read-length",1,0,'l'},
1354 {"insert-size",1,0,'i'},
1355 {"most-inserts",1,0,'m'},
1356 {"trim-quality",1,0,'q'},
1357 {"target-regions",0,0,'t'},
1358 {"required-flag",1,0,'f'},
1359 {"filtering-flag",0,0,'F'},
1365 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
1369 case 'f': stats->flag_require=strtol(optarg,0,0); break;
1370 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
1371 case 'd': stats->flag_filter|=BAM_FDUP; break;
1372 case 's': strcpy(in_mode, "r"); break;
1373 case 'r': stats->fai = fai_load(optarg);
1375 error("Could not load faidx: %s\n", optarg);
1379 if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
1380 error("Unable to parse --GC-depth %s\n", optarg);
1381 stats->gcd_bin_size = fbin;
1382 stats->gcd_ref_size = flen;
1385 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1386 error("Unable to parse -c %s\n", optarg);
1388 case 'l': stats->filter_readlen = atoi(optarg); break;
1389 case 'i': stats->nisize = atoi(optarg); break;
1390 case 'm': stats->isize_main_bulk = atof(optarg); break;
1391 case 'q': stats->trim_qual = atoi(optarg); break;
1392 case 't': targets = optarg; break;
1393 case 'I': group_id = optarg; break;
1395 case 'h': error(NULL);
1396 default: error("Unknown argument: %s\n", optarg);
1400 bam_fname = argv[optind++];
1404 if ( isatty(fileno((FILE *)stdin)) )
1410 // .. coverage bins and round buffer
1411 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1413 stats->cov_step = stats->cov_max - stats->cov_min;
1414 if ( stats->cov_step <= 0 )
1415 stats->cov_step = 1;
1417 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1418 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1419 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1420 stats->cov_rbuf.size = stats->nbases*5;
1421 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1423 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1424 error("Failed to open: %s\n", bam_fname);
1426 if ( group_id ) init_group_id(stats, group_id);
1427 bam1_t *bam_line = bam_init1();
1429 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1430 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1431 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1432 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1433 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1434 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1435 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1436 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1437 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1438 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1439 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1440 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1441 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1442 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1443 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1444 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1445 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1446 realloc_rseq_buffer(stats);
1448 init_regions(stats, targets);
1450 // Collect statistics
1453 // Collect stats in selected regions only
1454 bam_index_t *bam_idx = bam_index_load(bam_fname);
1456 error("Random alignment retrieval only works for indexed BAM files.\n");
1459 for (i=optind; i<argc; i++)
1462 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1463 if ( tid < 0 ) continue;
1464 reset_regions(stats);
1465 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1467 bam_index_destroy(bam_idx);
1471 // Stream through the entire BAM ignoring off-target regions if -t is given
1472 while (samread(sam,bam_line) >= 0)
1473 collect_stats(bam_line,stats);
1475 round_buffer_flush(stats,-1);
1477 output_stats(stats);
1479 bam_destroy1(bam_line);
1480 samclose(stats->sam);
1481 if (stats->fai) fai_destroy(stats->fai);
1482 free(stats->cov_rbuf.buffer); free(stats->cov);
1483 free(stats->quals_1st); free(stats->quals_2nd);
1484 free(stats->gc_1st); free(stats->gc_2nd);
1485 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1487 free(stats->rseq_buf);
1488 free(stats->mpc_buf);
1489 free(stats->acgt_cycles);
1490 free(stats->read_lengths);
1491 free(stats->insertions);
1492 free(stats->deletions);
1493 free(stats->ins_cycles_1st);
1494 free(stats->ins_cycles_2nd);
1495 free(stats->del_cycles_1st);
1496 free(stats->del_cycles_2nd);
1497 destroy_regions(stats);
1499 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);