2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - The whole reads are used with -t, no splicing is done, no indels or soft clips are
12 considered, even small overlap is good enough to include the read in the stats.
15 #define BAMCHECK_VERSION "2012-04-04"
17 #define _ISOC99_SOURCE
31 #define BWA_MIN_RDLEN 35
32 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
33 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
34 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
35 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
36 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
37 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
38 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
42 int32_t line_len, line_blen;
47 KHASH_MAP_INIT_STR(s, faidx1_t)
48 KHASH_MAP_INIT_STR(str, int)
63 // For coverage distribution, a simple pileup
72 typedef struct { uint32_t from, to; } pos_t;
83 int trim_qual; // bwa trim quality
84 int rmdup; // Exclude reads marked as duplicates from the stats
86 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
87 // insert size histogram holder
88 int nquals; // The number of quality bins
89 int nbases; // The maximum sequence length the allocated array can hold
90 int nisize; // The maximum insert size that the allocated array can hold
91 int ngc; // The size of gc_1st and gc_2nd
92 int nindels; // The maximum indel length for indel distribution
94 // Arrays for the histogram data
95 uint64_t *quals_1st, *quals_2nd;
96 uint64_t *gc_1st, *gc_2nd;
97 uint64_t *isize_inward, *isize_outward, *isize_other;
98 uint64_t *acgt_cycles;
99 uint64_t *read_lengths;
100 uint64_t *insertions, *deletions;
101 uint64_t *ins_cycles, *del_cycles;
103 // The extremes encountered
104 int max_len; // Maximum read length
105 int max_qual; // Maximum quality
106 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
111 uint64_t total_len_dup;
115 uint64_t nreads_unmapped;
116 uint64_t nreads_unpaired;
117 uint64_t nreads_paired;
119 uint64_t nbases_mapped;
120 uint64_t nbases_mapped_cigar;
121 uint64_t nbases_trimmed; // bwa trimmed bases
122 uint64_t nmismatches;
124 // GC-depth related data
125 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
126 gc_depth_t *gcd; // The GC-depth bins holder
127 int gcd_bin_size; // The size of GC-depth bin
128 int32_t tid, gcd_pos; // Position of the current bin
129 int32_t pos; // Position of the last read
131 // Coverage distribution related data
132 int ncov; // The number of coverage bins
133 uint64_t *cov; // The coverage frequencies
134 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
135 round_buffer_t cov_rbuf; // Pileup round buffer
137 // Mismatches by read cycle
138 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
139 int nref_seq; // The size of the buffer
140 int32_t rseq_pos; // The coordinate of the first base in the buffer
141 int32_t rseq_len; // The used part of the buffer
142 uint64_t *mpc_buf; // Mismatches per cycle
152 double sum_qual; // For calculating average quality value
153 samfile_t *sam; // Unused
154 faidx_t *fai; // Reference sequence for GC-depth graph
155 int argc; // Command line arguments to be printed on the output
160 void error(const char *format, ...);
162 // Coverage distribution methods
163 inline int coverage_idx(int min, int max, int n, int step, int depth)
171 return 1 + (depth - min) / step;
174 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
176 return (offset + (pos-refpos) % size) % size;
179 void round_buffer_flush(stats_t *stats, int64_t pos)
183 if ( pos==stats->cov_rbuf.pos )
186 int64_t new_pos = pos;
187 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
189 // Flush the whole buffer, but in sequential order,
190 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
193 if ( pos < stats->cov_rbuf.pos )
194 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
196 int ifrom = stats->cov_rbuf.start;
197 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
200 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
202 if ( !stats->cov_rbuf.buffer[ibuf] )
204 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
206 stats->cov_rbuf.buffer[ibuf] = 0;
210 for (ibuf=ifrom; ibuf<=ito; ibuf++)
212 if ( !stats->cov_rbuf.buffer[ibuf] )
214 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
216 stats->cov_rbuf.buffer[ibuf] = 0;
218 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
219 stats->cov_rbuf.pos = new_pos;
222 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
224 if ( to-from >= rbuf->size )
225 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
226 if ( from < rbuf->pos )
227 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
230 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
231 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
234 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
235 rbuf->buffer[ibuf]++;
238 for (ibuf=ifrom; ibuf<=ito; ibuf++)
239 rbuf->buffer[ibuf]++;
242 // Calculate the number of bases in the read trimmed by BWA
243 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
245 if ( len<BWA_MIN_RDLEN ) return 0;
247 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
248 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
249 int max_trimmed = len - BWA_MIN_RDLEN + 1;
250 int l, sum=0, max_sum=0, max_l=0;
252 for (l=0; l<max_trimmed; l++)
254 sum += trim_qual - quals[ reverse ? l : len-1-l ];
259 // This is the correct way, but bwa clips from some reason one base less
268 void count_indels(stats_t *stats,bam1_t *bam_line)
270 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
273 int read_len = bam_line->core.l_qseq;
274 for (icig=0; icig<bam_line->core.n_cigar; icig++)
276 // Conversion from uint32_t to MIDNSHP
279 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
280 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
284 int idx = is_fwd ? icycle : read_len-icycle;
285 if ( idx<0 ) error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
286 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d\n", idx,stats->nbases);
287 stats->ins_cycles[idx]++;
289 if ( ncig<=stats->nindels )
290 stats->insertions[ncig-1]++;
295 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
296 if ( idx<0 ) continue; // discard meaningless deletions
297 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
298 stats->del_cycles[idx]++;
299 if ( ncig<=stats->nindels )
300 stats->deletions[ncig-1]++;
307 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
309 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
310 int icig,iread=0,icycle=0;
311 int iref = bam_line->core.pos - stats->rseq_pos;
312 int read_len = bam_line->core.l_qseq;
313 uint8_t *read = bam1_seq(bam_line);
314 uint8_t *quals = bam1_qual(bam_line);
315 uint64_t *mpc_buf = stats->mpc_buf;
316 for (icig=0; icig<bam_line->core.n_cigar; icig++)
318 // Conversion from uint32_t to MIDNSHP
321 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
322 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
337 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
348 error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line));
350 if ( ncig+iref > stats->rseq_len )
351 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
354 for (im=0; im<ncig; im++)
356 uint8_t cread = bam1_seqi(read,iread);
357 uint8_t cref = stats->rseq_buf[iref];
363 int idx = is_fwd ? icycle : read_len-icycle-1;
364 if ( idx>stats->max_len )
365 error("mpc: %d>%d\n",idx,stats->max_len);
366 idx = idx*stats->nquals;
367 if ( idx>=stats->nquals*stats->nbases )
368 error("FIXME: mpc_buf overflow\n");
371 else if ( cref && cread && cref!=cread )
373 uint8_t qual = quals[iread] + 1;
374 if ( qual>=stats->nquals )
375 error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
377 int idx = is_fwd ? icycle : read_len-icycle-1;
378 if ( idx>stats->max_len )
379 error("mpc: %d>%d\n",idx,stats->max_len);
381 idx = idx*stats->nquals + qual;
382 if ( idx>=stats->nquals*stats->nbases )
383 error("FIXME: mpc_buf overflow\n");
394 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
400 faidx_t *fai = stats->fai;
403 chr = stats->sam->header->target_name[tid];
405 // ID of the sequence name
406 iter = kh_get(s, h, chr);
407 if (iter == kh_end(h))
408 error("No such reference sequence [%s]?\n", chr);
409 val = kh_value(h, iter);
411 // Check the boundaries
413 error("Was the bam file mapped with the reference sequence supplied?"
414 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
415 int size = stats->nref_seq;
416 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
417 if (size+pos > val.len) size = val.len-pos;
419 // Position the razf reader
420 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
422 uint8_t *ptr = stats->rseq_buf;
424 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
429 // Conversion between uint8_t coding and ACGT
432 if ( c=='A' || c=='a' )
434 else if ( c=='C' || c=='c' )
436 else if ( c=='G' || c=='g' )
438 else if ( c=='T' || c=='t' )
445 if ( nread < stats->nref_seq )
447 memset(ptr,0, stats->nref_seq - nread);
448 nread = stats->nref_seq;
450 stats->rseq_len = nread;
451 stats->rseq_pos = pos;
455 float fai_gc_content(stats_t *stats)
458 int i,size = stats->rseq_len;
462 for (i=0; i<size; i++)
464 c = stats->rseq_buf[i];
470 else if ( c==1 || c==8 )
473 return count ? (float)gc/count : 0;
477 void realloc_buffers(stats_t *stats, int seq_len)
479 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
481 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
482 if ( !stats->quals_1st )
483 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
484 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
486 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
487 if ( !stats->quals_2nd )
488 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
489 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
491 if ( stats->mpc_buf )
493 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
494 if ( !stats->mpc_buf )
495 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
496 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
499 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
500 if ( !stats->acgt_cycles )
501 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
502 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
504 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
505 if ( !stats->read_lengths )
506 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
507 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
509 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
510 if ( !stats->insertions )
511 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
512 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
514 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
515 if ( !stats->deletions )
516 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
517 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
519 stats->ins_cycles = realloc(stats->ins_cycles, (n+1)*sizeof(uint64_t));
520 if ( !stats->ins_cycles )
521 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
522 memset(stats->ins_cycles + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
524 stats->del_cycles = realloc(stats->del_cycles, (n+1)*sizeof(uint64_t));
525 if ( !stats->del_cycles )
526 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
527 memset(stats->del_cycles + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
531 // Realloc the coverage distribution buffer
532 int *rbuffer = calloc(sizeof(int),seq_len*5);
533 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
534 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
535 if ( stats->cov_rbuf.start>1 )
536 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
537 stats->cov_rbuf.start = 0;
538 free(stats->cov_rbuf.buffer);
539 stats->cov_rbuf.buffer = rbuffer;
540 stats->cov_rbuf.size = seq_len*5;
543 void collect_stats(bam1_t *bam_line, stats_t *stats)
545 if ( stats->rmdup && IS_DUP(bam_line) )
548 int seq_len = bam_line->core.l_qseq;
549 if ( !seq_len ) return;
550 if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
551 if ( seq_len >= stats->nbases )
552 realloc_buffers(stats,seq_len);
553 if ( stats->max_len<seq_len )
554 stats->max_len = seq_len;
556 stats->read_lengths[seq_len]++;
558 // Count GC and ACGT per cycle
559 uint8_t base, *seq = bam1_seq(bam_line);
562 int reverse = IS_REVERSE(bam_line);
563 for (i=0; i<seq_len; i++)
565 // Conversion from uint8_t coding to ACGT
569 base = bam1_seqi(seq,i);
571 if ( base==1 || base==2 ) gc_count++;
572 else if ( base>2 ) base=3;
573 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
574 error("FIXME: acgt_cycles\n");
575 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
577 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
578 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
579 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
581 // Determine which array (1st or 2nd read) will these stats go to,
582 // trim low quality bases from end the same way BWA does,
585 uint8_t *bam_quals = bam1_qual(bam_line);
586 if ( bam_line->core.flag&BAM_FREAD2 )
588 quals = stats->quals_2nd;
590 for (i=gc_idx_min; i<gc_idx_max; i++)
595 quals = stats->quals_1st;
597 for (i=gc_idx_min; i<gc_idx_max; i++)
600 if ( stats->trim_qual>0 )
601 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
603 // Quality histogram and average quality
604 for (i=0; i<seq_len; i++)
606 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
607 if ( qual>=stats->nquals )
608 error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
609 if ( qual>stats->max_qual )
610 stats->max_qual = qual;
612 quals[ i*stats->nquals+qual ]++;
613 stats->sum_qual += qual;
616 // Look at the flags and increment appropriate counters (mapped, paired, etc)
617 if ( IS_UNMAPPED(bam_line) )
618 stats->nreads_unmapped++;
621 if ( !bam_line->core.qual )
624 count_indels(stats,bam_line);
626 // The insert size is tricky, because for long inserts the libraries are
627 // prepared differently and the pairs point in other direction. BWA does
628 // not set the paired flag for them. Similar thing is true also for 454
629 // reads. Therefore, do the insert size stats for all mapped reads.
630 int32_t isize = bam_line->core.isize;
631 if ( isize<0 ) isize = -isize;
632 if ( IS_PAIRED(bam_line) && isize!=0 )
634 stats->nreads_paired++;
635 if ( isize >= stats->nisize )
636 isize=stats->nisize-1;
638 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
639 int is_fst = IS_READ1(bam_line) ? 1 : -1;
640 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
641 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
643 if ( is_fwd*is_mfwd>0 )
644 stats->isize_other[isize]++;
645 else if ( is_fst*pos_fst>0 )
647 if ( is_fst*is_fwd>0 )
648 stats->isize_inward[isize]++;
650 stats->isize_outward[isize]++;
652 else if ( is_fst*pos_fst<0 )
654 if ( is_fst*is_fwd>0 )
655 stats->isize_outward[isize]++;
657 stats->isize_inward[isize]++;
661 stats->nreads_unpaired++;
663 // Number of mismatches
664 uint8_t *nm = bam_aux_get(bam_line,"NM");
666 stats->nmismatches += bam_aux2i(nm);
668 // Number of mapped bases from cigar
669 if ( bam_line->core.n_cigar == 0)
670 error("FIXME: mapped read with no cigar?\n");
671 int readlen = seq_len;
672 for (i=0; i<bam_line->core.n_cigar; i++)
674 // Conversion from uint32_t to MIDNSHP
677 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
678 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
680 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
681 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
683 stats->nbases_mapped += seq_len;
685 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
686 stats->is_sorted = 0;
687 stats->pos = bam_line->core.pos;
689 if ( stats->is_sorted )
691 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
692 round_buffer_flush(stats,-1);
694 // Mismatches per cycle and GC-depth graph
697 // First pass, new chromosome or sequence spanning beyond the end of the buffer
698 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen )
700 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
702 // Get the reference GC content for this bin. Note that in the current implementation
703 // the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the
704 // expected read length only ~100bp, it shouldn't really matter.
705 stats->gcd_pos = bam_line->core.pos;
708 if ( stats->igcd >= stats->ngcd )
709 error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd);
711 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats);
713 count_mismatches_per_cycle(stats,bam_line);
715 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
717 // First pass or a new chromosome
718 stats->tid = bam_line->core.tid;
719 stats->gcd_pos = bam_line->core.pos;
722 if ( stats->igcd >= stats->ngcd )
724 uint32_t n = 2*(1 + stats->ngcd);
725 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
727 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
728 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
732 stats->gcd[ stats->igcd ].depth++;
733 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
735 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
737 // Coverage distribution graph
738 round_buffer_flush(stats,bam_line->core.pos);
739 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
743 stats->total_len += seq_len;
744 if ( IS_DUP(bam_line) )
746 stats->total_len_dup += seq_len;
751 // Sort by GC and depth
752 #define GCD_t(x) ((gc_depth_t *)x)
753 static int gcd_cmp(const void *a, const void *b)
755 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
756 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
757 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
758 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
763 float gcd_percentile(gc_depth_t *gcd, int N, int p)
773 return gcd[N-1].depth;
776 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
779 void output_stats(stats_t *stats)
781 // Calculate average insert size and standard deviation (from the main bulk data only)
783 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
784 for (isize=1; isize<stats->nisize; isize++)
786 // Each pair was counted twice
787 stats->isize_inward[isize] *= 0.5;
788 stats->isize_outward[isize] *= 0.5;
789 stats->isize_other[isize] *= 0.5;
791 nisize_inward += stats->isize_inward[isize];
792 nisize_outward += stats->isize_outward[isize];
793 nisize_other += stats->isize_other[isize];
794 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
797 double bulk=0, avg_isize=0, sd_isize=0;
798 for (isize=1; isize<stats->nisize; isize++)
800 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
801 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
803 if ( bulk/nisize > stats->isize_main_bulk )
810 avg_isize /= nisize ? nisize : 1;
811 for (isize=1; isize<ibulk; isize++)
812 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
813 sd_isize = sqrt(sd_isize);
816 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
817 printf("# The command line was: %s",stats->argv[0]);
819 for (i=1; i<stats->argc; i++)
820 printf(" %s",stats->argv[i]);
822 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
823 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
824 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
825 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
826 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
827 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
828 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
829 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
830 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
831 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
832 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
833 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
834 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
835 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
836 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
837 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
838 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
839 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
840 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
841 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
842 printf("SN\taverage length:\t%.0f\n", avg_read_length);
843 printf("SN\tmaximum length:\t%d\n", stats->max_len);
844 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
845 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
846 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
847 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
848 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
849 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
852 if ( stats->max_len<stats->nbases ) stats->max_len++;
853 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
854 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
855 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
856 for (ibase=0; ibase<stats->max_len; ibase++)
858 printf("FFQ\t%d",ibase+1);
859 for (iqual=0; iqual<=stats->max_qual; iqual++)
861 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
865 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
866 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
867 for (ibase=0; ibase<stats->max_len; ibase++)
869 printf("LFQ\t%d",ibase+1);
870 for (iqual=0; iqual<=stats->max_qual; iqual++)
872 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
876 if ( stats->mpc_buf )
878 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
879 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
880 printf("# is the number of N's and the rest is the number of mismatches\n");
881 for (ibase=0; ibase<stats->max_len; ibase++)
883 printf("MPC\t%d",ibase+1);
884 for (iqual=0; iqual<=stats->max_qual; iqual++)
886 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
891 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
893 for (ibase=0; ibase<stats->ngc; ibase++)
895 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
896 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
899 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
901 for (ibase=0; ibase<stats->ngc; ibase++)
903 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
904 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
907 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
908 for (ibase=0; ibase<stats->max_len; ibase++)
910 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
911 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
912 if ( ! sum ) continue;
913 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
915 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
916 for (isize=1; isize<ibulk; isize++)
917 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
918 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
920 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
922 for (ilen=0; ilen<stats->max_len; ilen++)
924 if ( stats->read_lengths[ilen]>0 )
925 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
928 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
929 for (ilen=0; ilen<stats->nindels; ilen++)
931 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
932 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
935 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions, number of deletions\n");
936 for (ilen=0; ilen<=stats->nbases; ilen++)
938 if ( stats->ins_cycles[ilen]>0 || stats->del_cycles[ilen]>0 )
939 printf("IC\t%d\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles[ilen], (long)stats->del_cycles[ilen]);
942 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
943 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
945 for (icov=1; icov<stats->ncov-1; icov++)
946 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
947 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
950 // Calculate average GC content, then sort by GC and depth
951 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
953 for (igcd=0; igcd<stats->igcd; igcd++)
956 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
958 if ( stats->gcd[igcd].depth )
959 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
961 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
963 while ( igcd < stats->igcd )
965 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
966 uint32_t nbins=0, itmp=igcd;
967 float gc = stats->gcd[igcd].gc;
968 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
973 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
974 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
975 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
976 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
977 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
978 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
984 void bam_init_header_hash(bam_header_t *header);
986 size_t mygetline(char **line, size_t *n, FILE *fp)
988 if (line == NULL || n == NULL || fp == NULL)
1001 while ((c=getc(fp))!= EOF && c!='\n')
1006 *line = realloc(*line, sizeof(char)*(*n));
1008 (*line)[nread-1] = c;
1013 *line = realloc(*line, sizeof(char)*(*n));
1016 return nread>0 ? nread : -1;
1020 void init_regions(stats_t *stats, char *file)
1023 khash_t(str) *header_hash;
1025 bam_init_header_hash(stats->sam->header);
1026 header_hash = (khash_t(str)*)stats->sam->header->hash;
1028 FILE *fp = fopen(file,"r");
1029 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1035 int prev_tid=-1, prev_pos=-1;
1036 while ((nread = mygetline(&line, &len, fp)) != -1)
1038 if ( line[0] == '#' ) continue;
1041 while ( i<nread && !isspace(line[i]) ) i++;
1042 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1045 iter = kh_get(str, header_hash, line);
1046 int tid = kh_val(header_hash, iter);
1047 if ( iter == kh_end(header_hash) )
1050 fprintf(stderr,"Warning: Some sequences not present in the BAM (%s)\n", line);
1055 if ( tid >= stats->nregions )
1057 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1059 for (j=stats->nregions; j<stats->nregions+100; j++)
1061 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1062 stats->regions[j].pos = NULL;
1064 stats->nregions += 100;
1066 int npos = stats->regions[tid].npos;
1067 if ( npos >= stats->regions[tid].mpos )
1069 stats->regions[tid].mpos += 1000;
1070 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1073 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1074 if ( prev_tid==-1 || prev_tid!=tid )
1077 prev_pos = stats->regions[tid].pos[npos].from;
1079 if ( prev_pos>stats->regions[tid].pos[npos].from )
1080 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1081 stats->regions[tid].npos++;
1083 if (line) free(line);
1087 void destroy_regions(stats_t *stats)
1090 for (i=0; i<stats->nregions; i++)
1092 if ( !stats->regions[i].mpos ) continue;
1093 free(stats->regions[i].pos);
1095 if ( stats->regions ) free(stats->regions);
1098 static int fetch_read(const bam1_t *bam_line, void *data)
1100 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1105 void error(const char *format, ...)
1109 printf("Version: %s\n", BAMCHECK_VERSION);
1110 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1111 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1112 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1113 printf("Options:\n");
1114 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1115 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1116 printf(" -h, --help This help message\n");
1117 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1118 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1119 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1120 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1121 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1122 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1123 printf(" -s, --sam Input is SAM\n");
1129 va_start(ap, format);
1130 vfprintf(stderr, format, ap);
1136 int main(int argc, char *argv[])
1138 char *targets = NULL;
1139 char *bam_fname = NULL;
1140 samfile_t *sam = NULL;
1143 stats_t *stats = calloc(1,sizeof(stats_t));
1146 stats->nbases = 300;
1147 stats->nisize = 8000;
1148 stats->max_len = 30;
1149 stats->max_qual = 40;
1150 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1151 stats->gcd_bin_size = 20000;
1152 stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big
1153 stats->nref_seq = stats->gcd_bin_size;
1154 stats->rseq_pos = -1;
1155 stats->tid = stats->gcd_pos = -1;
1156 stats->is_sorted = 1;
1158 stats->cov_max = 1000;
1159 stats->cov_step = 1;
1162 stats->filter_readlen = -1;
1163 stats->nindels = stats->nbases;
1165 strcpy(in_mode, "rb");
1167 static struct option loptions[] =
1170 {"remove-dups",0,0,'d'},
1172 {"ref-seq",0,0,'r'},
1173 {"coverage",0,0,'c'},
1174 {"read-length",0,0,'l'},
1175 {"insert-size",0,0,'i'},
1176 {"most-inserts",0,0,'m'},
1177 {"trim-quality",0,0,'q'},
1178 {"target-regions",0,0,'t'},
1182 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:",loptions,NULL))>0 )
1186 case 'd': stats->rmdup=1; break;
1187 case 's': strcpy(in_mode, "r"); break;
1188 case 'r': stats->fai = fai_load(optarg);
1190 error("Could not load faidx: %s\n", optarg);
1192 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1193 error("Unable to parse -c %s\n", optarg);
1195 case 'l': stats->filter_readlen = atoi(optarg); break;
1196 case 'i': stats->nisize = atoi(optarg); break;
1197 case 'm': stats->isize_main_bulk = atof(optarg); break;
1198 case 'q': stats->trim_qual = atoi(optarg); break;
1199 case 't': targets = optarg; break;
1201 case 'h': error(NULL);
1202 default: error("Unknown argument: %s\n", optarg);
1206 bam_fname = argv[optind++];
1210 if ( isatty(fileno((FILE *)stdin)) )
1216 // .. coverage bins and round buffer
1217 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1219 stats->cov_step = stats->cov_max - stats->cov_min;
1220 if ( stats->cov_step <= 0 )
1221 stats->cov_step = 1;
1223 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1224 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1225 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1226 stats->cov_rbuf.size = stats->nbases*5;
1227 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1229 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1230 error("Failed to open: %s\n", bam_fname);
1232 bam1_t *bam_line = bam_init1();
1234 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1235 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1236 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1237 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1238 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1239 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1240 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1241 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1242 stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t));
1243 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1244 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1245 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1246 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1247 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1248 stats->ins_cycles = calloc(stats->nbases+1,sizeof(uint64_t));
1249 stats->del_cycles = calloc(stats->nbases+1,sizeof(uint64_t));
1251 init_regions(stats, targets);
1253 // Collect statistics
1256 // Collect stats in selected regions only
1257 bam_index_t *bam_idx = bam_index_load(bam_fname);
1259 error("Random alignment retrieval only works for indexed BAM files.\n");
1262 for (i=optind; i<argc; i++)
1265 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1266 if ( tid < 0 ) continue;
1267 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1269 bam_index_destroy(bam_idx);
1273 // Stream through the entire BAM ignoring off-target regions if -t is given
1274 while (samread(sam,bam_line) >= 0)
1276 if ( stats->regions )
1278 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) continue;
1279 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1281 regions_t *reg = &stats->regions[bam_line->core.tid];
1282 if ( reg->cpos==reg->npos ) continue; // done for this chr
1284 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1285 // even small overlap is enough to include the read in the stats.
1287 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1288 if ( i>=reg->npos ) { reg->cpos = reg->npos; continue; }
1289 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) continue;
1292 collect_stats(bam_line,stats);
1295 round_buffer_flush(stats,-1);
1297 output_stats(stats);
1299 bam_destroy1(bam_line);
1300 samclose(stats->sam);
1301 if (stats->fai) fai_destroy(stats->fai);
1302 free(stats->cov_rbuf.buffer); free(stats->cov);
1303 free(stats->quals_1st); free(stats->quals_2nd);
1304 free(stats->gc_1st); free(stats->gc_2nd);
1305 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1307 free(stats->rseq_buf);
1308 free(stats->mpc_buf);
1309 free(stats->acgt_cycles);
1310 free(stats->read_lengths);
1311 free(stats->insertions);
1312 free(stats->deletions);
1313 free(stats->ins_cycles);
1314 free(stats->del_cycles);
1315 destroy_regions(stats);