2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14 good enough to include the read in the stats.
18 #define BAMCHECK_VERSION "2012-04-23"
20 #define _ISOC99_SOURCE
34 #define BWA_MIN_RDLEN 35
35 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
36 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
37 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
38 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
39 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
40 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
41 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
45 int32_t line_len, line_blen;
50 KHASH_MAP_INIT_STR(s, faidx1_t)
51 KHASH_MAP_INIT_STR(str, int)
66 // For coverage distribution, a simple pileup
75 typedef struct { uint32_t from, to; } pos_t;
86 int trim_qual; // bwa trim quality
87 int rmdup; // Exclude reads marked as duplicates from the stats
89 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
90 // insert size histogram holder
91 int nquals; // The number of quality bins
92 int nbases; // The maximum sequence length the allocated array can hold
93 int nisize; // The maximum insert size that the allocated array can hold
94 int ngc; // The size of gc_1st and gc_2nd
95 int nindels; // The maximum indel length for indel distribution
97 // Arrays for the histogram data
98 uint64_t *quals_1st, *quals_2nd;
99 uint64_t *gc_1st, *gc_2nd;
100 uint64_t *isize_inward, *isize_outward, *isize_other;
101 uint64_t *acgt_cycles;
102 uint64_t *read_lengths;
103 uint64_t *insertions, *deletions;
104 uint64_t *ins_cycles, *del_cycles;
106 // The extremes encountered
107 int max_len; // Maximum read length
108 int max_qual; // Maximum quality
109 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
114 uint64_t total_len_dup;
118 uint64_t nreads_unmapped;
119 uint64_t nreads_unpaired;
120 uint64_t nreads_paired;
122 uint64_t nbases_mapped;
123 uint64_t nbases_mapped_cigar;
124 uint64_t nbases_trimmed; // bwa trimmed bases
125 uint64_t nmismatches;
127 // GC-depth related data
128 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
129 gc_depth_t *gcd; // The GC-depth bins holder
130 int gcd_bin_size; // The size of GC-depth bin
131 int32_t tid, gcd_pos; // Position of the current bin
132 int32_t pos; // Position of the last read
134 // Coverage distribution related data
135 int ncov; // The number of coverage bins
136 uint64_t *cov; // The coverage frequencies
137 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
138 round_buffer_t cov_rbuf; // Pileup round buffer
140 // Mismatches by read cycle
141 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
142 int nref_seq; // The size of the buffer
143 int32_t rseq_pos; // The coordinate of the first base in the buffer
144 int32_t rseq_len; // The used part of the buffer
145 uint64_t *mpc_buf; // Mismatches per cycle
151 int nregions, reg_from,reg_to;
155 double sum_qual; // For calculating average quality value
156 samfile_t *sam; // Unused
157 faidx_t *fai; // Reference sequence for GC-depth graph
158 int argc; // Command line arguments to be printed on the output
163 void error(const char *format, ...);
164 void bam_init_header_hash(bam_header_t *header);
165 int is_in_regions(bam1_t *bam_line, stats_t *stats);
168 // Coverage distribution methods
169 inline int coverage_idx(int min, int max, int n, int step, int depth)
177 return 1 + (depth - min) / step;
180 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
182 return (offset + (pos-refpos) % size) % size;
185 void round_buffer_flush(stats_t *stats, int64_t pos)
189 if ( pos==stats->cov_rbuf.pos )
192 int64_t new_pos = pos;
193 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
195 // Flush the whole buffer, but in sequential order,
196 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
199 if ( pos < stats->cov_rbuf.pos )
200 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
202 int ifrom = stats->cov_rbuf.start;
203 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
206 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
208 if ( !stats->cov_rbuf.buffer[ibuf] )
210 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
212 stats->cov_rbuf.buffer[ibuf] = 0;
216 for (ibuf=ifrom; ibuf<=ito; ibuf++)
218 if ( !stats->cov_rbuf.buffer[ibuf] )
220 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
222 stats->cov_rbuf.buffer[ibuf] = 0;
224 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
225 stats->cov_rbuf.pos = new_pos;
228 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
230 if ( to-from >= rbuf->size )
231 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
232 if ( from < rbuf->pos )
233 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
236 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
237 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
240 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
241 rbuf->buffer[ibuf]++;
244 for (ibuf=ifrom; ibuf<=ito; ibuf++)
245 rbuf->buffer[ibuf]++;
248 // Calculate the number of bases in the read trimmed by BWA
249 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
251 if ( len<BWA_MIN_RDLEN ) return 0;
253 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
254 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
255 int max_trimmed = len - BWA_MIN_RDLEN + 1;
256 int l, sum=0, max_sum=0, max_l=0;
258 for (l=0; l<max_trimmed; l++)
260 sum += trim_qual - quals[ reverse ? l : len-1-l ];
265 // This is the correct way, but bwa clips from some reason one base less
274 void count_indels(stats_t *stats,bam1_t *bam_line)
276 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
279 int read_len = bam_line->core.l_qseq;
280 for (icig=0; icig<bam_line->core.n_cigar; icig++)
282 // Conversion from uint32_t to MIDNSHP
285 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
286 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
290 int idx = is_fwd ? icycle : read_len-icycle;
291 if ( idx<0 ) error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
292 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d\n", idx,stats->nbases);
293 stats->ins_cycles[idx]++;
295 if ( ncig<=stats->nindels )
296 stats->insertions[ncig-1]++;
301 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
302 if ( idx<0 ) continue; // discard meaningless deletions
303 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
304 stats->del_cycles[idx]++;
305 if ( ncig<=stats->nindels )
306 stats->deletions[ncig-1]++;
313 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
315 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
316 int icig,iread=0,icycle=0;
317 int iref = bam_line->core.pos - stats->rseq_pos;
318 int read_len = bam_line->core.l_qseq;
319 uint8_t *read = bam1_seq(bam_line);
320 uint8_t *quals = bam1_qual(bam_line);
321 uint64_t *mpc_buf = stats->mpc_buf;
322 for (icig=0; icig<bam_line->core.n_cigar; icig++)
324 // Conversion from uint32_t to MIDNSHP
327 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
328 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
343 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
354 error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line));
356 if ( ncig+iref > stats->rseq_len )
357 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
360 for (im=0; im<ncig; im++)
362 uint8_t cread = bam1_seqi(read,iread);
363 uint8_t cref = stats->rseq_buf[iref];
369 int idx = is_fwd ? icycle : read_len-icycle-1;
370 if ( idx>stats->max_len )
371 error("mpc: %d>%d\n",idx,stats->max_len);
372 idx = idx*stats->nquals;
373 if ( idx>=stats->nquals*stats->nbases )
374 error("FIXME: mpc_buf overflow\n");
377 else if ( cref && cread && cref!=cread )
379 uint8_t qual = quals[iread] + 1;
380 if ( qual>=stats->nquals )
381 error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
383 int idx = is_fwd ? icycle : read_len-icycle-1;
384 if ( idx>stats->max_len )
385 error("mpc: %d>%d\n",idx,stats->max_len);
387 idx = idx*stats->nquals + qual;
388 if ( idx>=stats->nquals*stats->nbases )
389 error("FIXME: mpc_buf overflow\n");
400 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
406 faidx_t *fai = stats->fai;
409 chr = stats->sam->header->target_name[tid];
411 // ID of the sequence name
412 iter = kh_get(s, h, chr);
413 if (iter == kh_end(h))
414 error("No such reference sequence [%s]?\n", chr);
415 val = kh_value(h, iter);
417 // Check the boundaries
419 error("Was the bam file mapped with the reference sequence supplied?"
420 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
421 int size = stats->nref_seq;
422 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
423 if (size+pos > val.len) size = val.len-pos;
425 // Position the razf reader
426 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
428 uint8_t *ptr = stats->rseq_buf;
430 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
435 // Conversion between uint8_t coding and ACGT
438 if ( c=='A' || c=='a' )
440 else if ( c=='C' || c=='c' )
442 else if ( c=='G' || c=='g' )
444 else if ( c=='T' || c=='t' )
451 if ( nread < stats->nref_seq )
453 memset(ptr,0, stats->nref_seq - nread);
454 nread = stats->nref_seq;
456 stats->rseq_len = nread;
457 stats->rseq_pos = pos;
461 float fai_gc_content(stats_t *stats)
464 int i,size = stats->rseq_len;
468 for (i=0; i<size; i++)
470 c = stats->rseq_buf[i];
476 else if ( c==1 || c==8 )
479 return count ? (float)gc/count : 0;
483 void realloc_buffers(stats_t *stats, int seq_len)
485 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
487 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
488 if ( !stats->quals_1st )
489 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
490 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
492 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
493 if ( !stats->quals_2nd )
494 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
495 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
497 if ( stats->mpc_buf )
499 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
500 if ( !stats->mpc_buf )
501 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
502 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
505 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
506 if ( !stats->acgt_cycles )
507 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
508 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
510 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
511 if ( !stats->read_lengths )
512 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
513 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
515 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
516 if ( !stats->insertions )
517 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
518 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
520 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
521 if ( !stats->deletions )
522 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
523 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
525 stats->ins_cycles = realloc(stats->ins_cycles, (n+1)*sizeof(uint64_t));
526 if ( !stats->ins_cycles )
527 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
528 memset(stats->ins_cycles + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
530 stats->del_cycles = realloc(stats->del_cycles, (n+1)*sizeof(uint64_t));
531 if ( !stats->del_cycles )
532 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
533 memset(stats->del_cycles + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
537 // Realloc the coverage distribution buffer
538 int *rbuffer = calloc(sizeof(int),seq_len*5);
539 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
540 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
541 if ( stats->cov_rbuf.start>1 )
542 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
543 stats->cov_rbuf.start = 0;
544 free(stats->cov_rbuf.buffer);
545 stats->cov_rbuf.buffer = rbuffer;
546 stats->cov_rbuf.size = seq_len*5;
549 void collect_stats(bam1_t *bam_line, stats_t *stats)
551 if ( stats->rmdup && IS_DUP(bam_line) )
554 if ( !is_in_regions(bam_line,stats) )
557 int seq_len = bam_line->core.l_qseq;
558 if ( !seq_len ) return;
559 if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
560 if ( seq_len >= stats->nbases )
561 realloc_buffers(stats,seq_len);
562 if ( stats->max_len<seq_len )
563 stats->max_len = seq_len;
565 stats->read_lengths[seq_len]++;
567 // Count GC and ACGT per cycle
568 uint8_t base, *seq = bam1_seq(bam_line);
571 int reverse = IS_REVERSE(bam_line);
572 for (i=0; i<seq_len; i++)
574 // Conversion from uint8_t coding to ACGT
578 base = bam1_seqi(seq,i);
580 if ( base==1 || base==2 ) gc_count++;
581 else if ( base>2 ) base=3;
582 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
583 error("FIXME: acgt_cycles\n");
584 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
586 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
587 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
588 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
590 // Determine which array (1st or 2nd read) will these stats go to,
591 // trim low quality bases from end the same way BWA does,
594 uint8_t *bam_quals = bam1_qual(bam_line);
595 if ( bam_line->core.flag&BAM_FREAD2 )
597 quals = stats->quals_2nd;
599 for (i=gc_idx_min; i<gc_idx_max; i++)
604 quals = stats->quals_1st;
606 for (i=gc_idx_min; i<gc_idx_max; i++)
609 if ( stats->trim_qual>0 )
610 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
612 // Quality histogram and average quality
613 for (i=0; i<seq_len; i++)
615 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
616 if ( qual>=stats->nquals )
617 error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
618 if ( qual>stats->max_qual )
619 stats->max_qual = qual;
621 quals[ i*stats->nquals+qual ]++;
622 stats->sum_qual += qual;
625 // Look at the flags and increment appropriate counters (mapped, paired, etc)
626 if ( IS_UNMAPPED(bam_line) )
627 stats->nreads_unmapped++;
630 if ( !bam_line->core.qual )
633 count_indels(stats,bam_line);
635 // The insert size is tricky, because for long inserts the libraries are
636 // prepared differently and the pairs point in other direction. BWA does
637 // not set the paired flag for them. Similar thing is true also for 454
638 // reads. Therefore, do the insert size stats for all mapped reads.
639 int32_t isize = bam_line->core.isize;
640 if ( isize<0 ) isize = -isize;
641 if ( IS_PAIRED(bam_line) && isize!=0 )
643 stats->nreads_paired++;
644 if ( isize >= stats->nisize )
645 isize=stats->nisize-1;
647 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
648 int is_fst = IS_READ1(bam_line) ? 1 : -1;
649 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
650 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
652 if ( is_fwd*is_mfwd>0 )
653 stats->isize_other[isize]++;
654 else if ( is_fst*pos_fst>0 )
656 if ( is_fst*is_fwd>0 )
657 stats->isize_inward[isize]++;
659 stats->isize_outward[isize]++;
661 else if ( is_fst*pos_fst<0 )
663 if ( is_fst*is_fwd>0 )
664 stats->isize_outward[isize]++;
666 stats->isize_inward[isize]++;
670 stats->nreads_unpaired++;
672 // Number of mismatches
673 uint8_t *nm = bam_aux_get(bam_line,"NM");
675 stats->nmismatches += bam_aux2i(nm);
677 // Number of mapped bases from cigar
678 // Conversion from uint32_t to MIDNSHP
681 if ( bam_line->core.n_cigar == 0)
682 error("FIXME: mapped read with no cigar?\n");
684 if ( stats->regions )
686 // Count only on-target bases
687 int iref = bam_line->core.pos + 1;
688 for (i=0; i<bam_line->core.n_cigar; i++)
690 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
691 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
692 if ( cig==2 ) readlen += ncig;
695 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
696 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
697 if ( ncig<0 ) ncig = 0;
698 stats->nbases_mapped_cigar += ncig;
699 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
704 if ( iref>=stats->reg_from && iref<=stats->reg_to )
705 stats->nbases_mapped_cigar += ncig;
711 // Count the whole read
712 for (i=0; i<bam_line->core.n_cigar; i++)
714 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
715 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
716 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
717 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
720 stats->nbases_mapped += seq_len;
722 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
723 stats->is_sorted = 0;
724 stats->pos = bam_line->core.pos;
726 if ( stats->is_sorted )
728 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
729 round_buffer_flush(stats,-1);
731 // Mismatches per cycle and GC-depth graph
734 // First pass, new chromosome or sequence spanning beyond the end of the buffer
735 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen )
737 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
739 // Get the reference GC content for this bin. Note that in the current implementation
740 // the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the
741 // expected read length only ~100bp, it shouldn't really matter.
742 stats->gcd_pos = bam_line->core.pos;
745 if ( stats->igcd >= stats->ngcd )
746 error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd);
748 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats);
750 count_mismatches_per_cycle(stats,bam_line);
752 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
754 // First pass or a new chromosome
755 stats->tid = bam_line->core.tid;
756 stats->gcd_pos = bam_line->core.pos;
759 if ( stats->igcd >= stats->ngcd )
761 uint32_t n = 2*(1 + stats->ngcd);
762 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
764 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
765 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
769 stats->gcd[ stats->igcd ].depth++;
770 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
772 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
774 // Coverage distribution graph
775 round_buffer_flush(stats,bam_line->core.pos);
776 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
780 stats->total_len += seq_len;
781 if ( IS_DUP(bam_line) )
783 stats->total_len_dup += seq_len;
788 // Sort by GC and depth
789 #define GCD_t(x) ((gc_depth_t *)x)
790 static int gcd_cmp(const void *a, const void *b)
792 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
793 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
794 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
795 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
800 float gcd_percentile(gc_depth_t *gcd, int N, int p)
810 return gcd[N-1].depth;
813 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
816 void output_stats(stats_t *stats)
818 // Calculate average insert size and standard deviation (from the main bulk data only)
820 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
821 for (isize=1; isize<stats->nisize; isize++)
823 // Each pair was counted twice
824 stats->isize_inward[isize] *= 0.5;
825 stats->isize_outward[isize] *= 0.5;
826 stats->isize_other[isize] *= 0.5;
828 nisize_inward += stats->isize_inward[isize];
829 nisize_outward += stats->isize_outward[isize];
830 nisize_other += stats->isize_other[isize];
831 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
834 double bulk=0, avg_isize=0, sd_isize=0;
835 for (isize=1; isize<stats->nisize; isize++)
837 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
838 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
840 if ( bulk/nisize > stats->isize_main_bulk )
847 avg_isize /= nisize ? nisize : 1;
848 for (isize=1; isize<ibulk; isize++)
849 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
850 sd_isize = sqrt(sd_isize);
853 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
854 printf("# The command line was: %s",stats->argv[0]);
856 for (i=1; i<stats->argc; i++)
857 printf(" %s",stats->argv[i]);
859 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
860 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
861 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
862 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
863 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
864 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
865 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
866 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
867 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
868 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
869 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
870 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
871 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
872 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
873 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
874 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
875 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
876 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
877 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
878 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
879 printf("SN\taverage length:\t%.0f\n", avg_read_length);
880 printf("SN\tmaximum length:\t%d\n", stats->max_len);
881 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
882 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
883 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
884 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
885 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
886 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
889 if ( stats->max_len<stats->nbases ) stats->max_len++;
890 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
891 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
892 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
893 for (ibase=0; ibase<stats->max_len; ibase++)
895 printf("FFQ\t%d",ibase+1);
896 for (iqual=0; iqual<=stats->max_qual; iqual++)
898 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
902 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
903 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
904 for (ibase=0; ibase<stats->max_len; ibase++)
906 printf("LFQ\t%d",ibase+1);
907 for (iqual=0; iqual<=stats->max_qual; iqual++)
909 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
913 if ( stats->mpc_buf )
915 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
916 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
917 printf("# is the number of N's and the rest is the number of mismatches\n");
918 for (ibase=0; ibase<stats->max_len; ibase++)
920 printf("MPC\t%d",ibase+1);
921 for (iqual=0; iqual<=stats->max_qual; iqual++)
923 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
928 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
930 for (ibase=0; ibase<stats->ngc; ibase++)
932 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
933 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
936 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
938 for (ibase=0; ibase<stats->ngc; ibase++)
940 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
941 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
944 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
945 for (ibase=0; ibase<stats->max_len; ibase++)
947 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
948 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
949 if ( ! sum ) continue;
950 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
952 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
953 for (isize=1; isize<ibulk; isize++)
954 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
955 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
957 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
959 for (ilen=0; ilen<stats->max_len; ilen++)
961 if ( stats->read_lengths[ilen]>0 )
962 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
965 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
966 for (ilen=0; ilen<stats->nindels; ilen++)
968 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
969 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
972 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions, number of deletions\n");
973 for (ilen=0; ilen<=stats->nbases; ilen++)
975 if ( stats->ins_cycles[ilen]>0 || stats->del_cycles[ilen]>0 )
976 printf("IC\t%d\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles[ilen], (long)stats->del_cycles[ilen]);
979 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
980 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
982 for (icov=1; icov<stats->ncov-1; icov++)
983 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
984 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
987 // Calculate average GC content, then sort by GC and depth
988 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
990 for (igcd=0; igcd<stats->igcd; igcd++)
993 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
995 if ( stats->gcd[igcd].depth )
996 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
998 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1000 while ( igcd < stats->igcd )
1002 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1003 uint32_t nbins=0, itmp=igcd;
1004 float gc = stats->gcd[igcd].gc;
1005 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1010 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1011 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1012 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
1013 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
1014 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
1015 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
1021 size_t getline(char **line, size_t *n, FILE *fp)
1023 if (line == NULL || n == NULL || fp == NULL)
1028 if (*n==0 || !*line)
1036 while ((c=getc(fp))!= EOF && c!='\n')
1041 *line = realloc(*line, sizeof(char)*(*n));
1043 (*line)[nread-1] = c;
1048 *line = realloc(*line, sizeof(char)*(*n));
1051 return nread>0 ? nread : -1;
1055 void init_regions(stats_t *stats, char *file)
1058 khash_t(str) *header_hash;
1060 bam_init_header_hash(stats->sam->header);
1061 header_hash = (khash_t(str)*)stats->sam->header->hash;
1063 FILE *fp = fopen(file,"r");
1064 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1070 int prev_tid=-1, prev_pos=-1;
1071 while ((nread = getline(&line, &len, fp)) != -1)
1073 if ( line[0] == '#' ) continue;
1076 while ( i<nread && !isspace(line[i]) ) i++;
1077 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1080 iter = kh_get(str, header_hash, line);
1081 int tid = kh_val(header_hash, iter);
1082 if ( iter == kh_end(header_hash) )
1085 fprintf(stderr,"Warning: Some sequences not present in the BAM (%s)\n", line);
1090 if ( tid >= stats->nregions )
1092 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1094 for (j=stats->nregions; j<stats->nregions+100; j++)
1096 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1097 stats->regions[j].pos = NULL;
1099 stats->nregions += 100;
1101 int npos = stats->regions[tid].npos;
1102 if ( npos >= stats->regions[tid].mpos )
1104 stats->regions[tid].mpos += 1000;
1105 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1108 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1109 if ( prev_tid==-1 || prev_tid!=tid )
1112 prev_pos = stats->regions[tid].pos[npos].from;
1114 if ( prev_pos>stats->regions[tid].pos[npos].from )
1115 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1116 stats->regions[tid].npos++;
1118 if (line) free(line);
1122 void destroy_regions(stats_t *stats)
1125 for (i=0; i<stats->nregions; i++)
1127 if ( !stats->regions[i].mpos ) continue;
1128 free(stats->regions[i].pos);
1130 if ( stats->regions ) free(stats->regions);
1133 static int fetch_read(const bam1_t *bam_line, void *data)
1135 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1139 void reset_regions(stats_t *stats)
1142 for (i=0; i<stats->nregions; i++)
1143 stats->regions[i].cpos = 0;
1146 int is_in_regions(bam1_t *bam_line, stats_t *stats)
1148 if ( !stats->regions ) return 1;
1150 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1151 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1153 regions_t *reg = &stats->regions[bam_line->core.tid];
1154 if ( reg->cpos==reg->npos ) return 0; // done for this chr
1156 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1157 // even small overlap is enough to include the read in the stats.
1159 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1160 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1161 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1163 stats->reg_from = reg->pos[i].from;
1164 stats->reg_to = reg->pos[i].to;
1169 void error(const char *format, ...)
1173 printf("Version: %s\n", BAMCHECK_VERSION);
1174 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1175 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1176 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1177 printf("Options:\n");
1178 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1179 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1180 printf(" -h, --help This help message\n");
1181 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1182 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1183 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1184 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1185 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1186 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1187 printf(" -s, --sam Input is SAM\n");
1193 va_start(ap, format);
1194 vfprintf(stderr, format, ap);
1200 int main(int argc, char *argv[])
1202 char *targets = NULL;
1203 char *bam_fname = NULL;
1204 samfile_t *sam = NULL;
1207 stats_t *stats = calloc(1,sizeof(stats_t));
1210 stats->nbases = 300;
1211 stats->nisize = 8000;
1212 stats->max_len = 30;
1213 stats->max_qual = 40;
1214 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1215 stats->gcd_bin_size = 20000;
1216 stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big
1217 stats->nref_seq = stats->gcd_bin_size;
1218 stats->rseq_pos = -1;
1219 stats->tid = stats->gcd_pos = -1;
1220 stats->is_sorted = 1;
1222 stats->cov_max = 1000;
1223 stats->cov_step = 1;
1226 stats->filter_readlen = -1;
1227 stats->nindels = stats->nbases;
1229 strcpy(in_mode, "rb");
1231 static struct option loptions[] =
1234 {"remove-dups",0,0,'d'},
1236 {"ref-seq",0,0,'r'},
1237 {"coverage",0,0,'c'},
1238 {"read-length",0,0,'l'},
1239 {"insert-size",0,0,'i'},
1240 {"most-inserts",0,0,'m'},
1241 {"trim-quality",0,0,'q'},
1242 {"target-regions",0,0,'t'},
1246 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:",loptions,NULL))>0 )
1250 case 'd': stats->rmdup=1; break;
1251 case 's': strcpy(in_mode, "r"); break;
1252 case 'r': stats->fai = fai_load(optarg);
1254 error("Could not load faidx: %s\n", optarg);
1256 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1257 error("Unable to parse -c %s\n", optarg);
1259 case 'l': stats->filter_readlen = atoi(optarg); break;
1260 case 'i': stats->nisize = atoi(optarg); break;
1261 case 'm': stats->isize_main_bulk = atof(optarg); break;
1262 case 'q': stats->trim_qual = atoi(optarg); break;
1263 case 't': targets = optarg; break;
1265 case 'h': error(NULL);
1266 default: error("Unknown argument: %s\n", optarg);
1270 bam_fname = argv[optind++];
1274 if ( isatty(fileno((FILE *)stdin)) )
1280 // .. coverage bins and round buffer
1281 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1283 stats->cov_step = stats->cov_max - stats->cov_min;
1284 if ( stats->cov_step <= 0 )
1285 stats->cov_step = 1;
1287 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1288 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1289 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1290 stats->cov_rbuf.size = stats->nbases*5;
1291 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1293 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1294 error("Failed to open: %s\n", bam_fname);
1296 bam1_t *bam_line = bam_init1();
1298 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1299 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1300 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1301 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1302 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1303 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1304 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1305 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1306 stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t));
1307 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1308 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1309 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1310 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1311 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1312 stats->ins_cycles = calloc(stats->nbases+1,sizeof(uint64_t));
1313 stats->del_cycles = calloc(stats->nbases+1,sizeof(uint64_t));
1315 init_regions(stats, targets);
1317 // Collect statistics
1320 // Collect stats in selected regions only
1321 bam_index_t *bam_idx = bam_index_load(bam_fname);
1323 error("Random alignment retrieval only works for indexed BAM files.\n");
1326 for (i=optind; i<argc; i++)
1329 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1330 if ( tid < 0 ) continue;
1331 reset_regions(stats);
1332 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1334 bam_index_destroy(bam_idx);
1338 // Stream through the entire BAM ignoring off-target regions if -t is given
1339 while (samread(sam,bam_line) >= 0)
1340 collect_stats(bam_line,stats);
1342 round_buffer_flush(stats,-1);
1344 output_stats(stats);
1346 bam_destroy1(bam_line);
1347 samclose(stats->sam);
1348 if (stats->fai) fai_destroy(stats->fai);
1349 free(stats->cov_rbuf.buffer); free(stats->cov);
1350 free(stats->quals_1st); free(stats->quals_2nd);
1351 free(stats->gc_1st); free(stats->gc_2nd);
1352 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1354 free(stats->rseq_buf);
1355 free(stats->mpc_buf);
1356 free(stats->acgt_cycles);
1357 free(stats->read_lengths);
1358 free(stats->insertions);
1359 free(stats->deletions);
1360 free(stats->ins_cycles);
1361 free(stats->del_cycles);
1362 destroy_regions(stats);