2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14 good enough to include the read in the stats.
18 #define BAMCHECK_VERSION "2012-04-24"
20 #define _ISOC99_SOURCE
34 #define BWA_MIN_RDLEN 35
35 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
36 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
37 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
38 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
39 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
40 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
41 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
45 int32_t line_len, line_blen;
50 KHASH_MAP_INIT_STR(s, faidx1_t)
51 KHASH_MAP_INIT_STR(str, int)
66 // For coverage distribution, a simple pileup
75 typedef struct { uint32_t from, to; } pos_t;
86 int trim_qual; // bwa trim quality
88 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
89 // insert size histogram holder
90 int nquals; // The number of quality bins
91 int nbases; // The maximum sequence length the allocated array can hold
92 int nisize; // The maximum insert size that the allocated array can hold
93 int ngc; // The size of gc_1st and gc_2nd
94 int nindels; // The maximum indel length for indel distribution
96 // Arrays for the histogram data
97 uint64_t *quals_1st, *quals_2nd;
98 uint64_t *gc_1st, *gc_2nd;
99 uint64_t *isize_inward, *isize_outward, *isize_other;
100 uint64_t *acgt_cycles;
101 uint64_t *read_lengths;
102 uint64_t *insertions, *deletions;
103 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
105 // The extremes encountered
106 int max_len; // Maximum read length
107 int max_qual; // Maximum quality
108 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
113 uint64_t total_len_dup;
117 uint64_t nreads_unmapped;
118 uint64_t nreads_unpaired;
119 uint64_t nreads_paired;
121 uint64_t nbases_mapped;
122 uint64_t nbases_mapped_cigar;
123 uint64_t nbases_trimmed; // bwa trimmed bases
124 uint64_t nmismatches;
126 // GC-depth related data
127 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
128 gc_depth_t *gcd; // The GC-depth bins holder
129 int gcd_bin_size; // The size of GC-depth bin
130 int32_t tid, gcd_pos; // Position of the current bin
131 int32_t pos; // Position of the last read
133 // Coverage distribution related data
134 int ncov; // The number of coverage bins
135 uint64_t *cov; // The coverage frequencies
136 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
137 round_buffer_t cov_rbuf; // Pileup round buffer
139 // Mismatches by read cycle
140 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
141 int nref_seq; // The size of the buffer
142 int32_t rseq_pos; // The coordinate of the first base in the buffer
143 int32_t rseq_len; // The used part of the buffer
144 uint64_t *mpc_buf; // Mismatches per cycle
150 int nregions, reg_from,reg_to;
154 int flag_require, flag_filter;
155 double sum_qual; // For calculating average quality value
156 samfile_t *sam; // Unused
157 faidx_t *fai; // Reference sequence for GC-depth graph
158 int argc; // Command line arguments to be printed on the output
163 void error(const char *format, ...);
164 void bam_init_header_hash(bam_header_t *header);
165 int is_in_regions(bam1_t *bam_line, stats_t *stats);
168 // Coverage distribution methods
169 inline int coverage_idx(int min, int max, int n, int step, int depth)
177 return 1 + (depth - min) / step;
180 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
182 return (offset + (pos-refpos) % size) % size;
185 void round_buffer_flush(stats_t *stats, int64_t pos)
189 if ( pos==stats->cov_rbuf.pos )
192 int64_t new_pos = pos;
193 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
195 // Flush the whole buffer, but in sequential order,
196 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
199 if ( pos < stats->cov_rbuf.pos )
200 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
202 int ifrom = stats->cov_rbuf.start;
203 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
206 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
208 if ( !stats->cov_rbuf.buffer[ibuf] )
210 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
212 stats->cov_rbuf.buffer[ibuf] = 0;
216 for (ibuf=ifrom; ibuf<=ito; ibuf++)
218 if ( !stats->cov_rbuf.buffer[ibuf] )
220 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
222 stats->cov_rbuf.buffer[ibuf] = 0;
224 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
225 stats->cov_rbuf.pos = new_pos;
228 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
230 if ( to-from >= rbuf->size )
231 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
232 if ( from < rbuf->pos )
233 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
236 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
237 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
240 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
241 rbuf->buffer[ibuf]++;
244 for (ibuf=ifrom; ibuf<=ito; ibuf++)
245 rbuf->buffer[ibuf]++;
248 // Calculate the number of bases in the read trimmed by BWA
249 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
251 if ( len<BWA_MIN_RDLEN ) return 0;
253 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
254 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
255 int max_trimmed = len - BWA_MIN_RDLEN + 1;
256 int l, sum=0, max_sum=0, max_l=0;
258 for (l=0; l<max_trimmed; l++)
260 sum += trim_qual - quals[ reverse ? l : len-1-l ];
265 // This is the correct way, but bwa clips from some reason one base less
274 void count_indels(stats_t *stats,bam1_t *bam_line)
276 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
277 int is_1st = IS_READ1(bam_line) ? 1 : 0;
280 int read_len = bam_line->core.l_qseq;
281 for (icig=0; icig<bam_line->core.n_cigar; icig++)
283 // Conversion from uint32_t to MIDNSHP
286 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
287 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
291 int idx = is_fwd ? icycle : read_len-icycle;
292 if ( idx<0 ) error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
293 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d\n", idx,stats->nbases);
295 stats->ins_cycles_1st[idx]++;
297 stats->ins_cycles_2nd[idx]++;
299 if ( ncig<=stats->nindels )
300 stats->insertions[ncig-1]++;
305 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
306 if ( idx<0 ) continue; // discard meaningless deletions
307 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
309 stats->del_cycles_1st[idx]++;
311 stats->del_cycles_2nd[idx]++;
312 if ( ncig<=stats->nindels )
313 stats->deletions[ncig-1]++;
320 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
322 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
323 int icig,iread=0,icycle=0;
324 int iref = bam_line->core.pos - stats->rseq_pos;
325 int read_len = bam_line->core.l_qseq;
326 uint8_t *read = bam1_seq(bam_line);
327 uint8_t *quals = bam1_qual(bam_line);
328 uint64_t *mpc_buf = stats->mpc_buf;
329 for (icig=0; icig<bam_line->core.n_cigar; icig++)
331 // Conversion from uint32_t to MIDNSHP
334 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
335 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
350 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
361 error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line));
363 if ( ncig+iref > stats->rseq_len )
364 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
367 for (im=0; im<ncig; im++)
369 uint8_t cread = bam1_seqi(read,iread);
370 uint8_t cref = stats->rseq_buf[iref];
376 int idx = is_fwd ? icycle : read_len-icycle-1;
377 if ( idx>stats->max_len )
378 error("mpc: %d>%d\n",idx,stats->max_len);
379 idx = idx*stats->nquals;
380 if ( idx>=stats->nquals*stats->nbases )
381 error("FIXME: mpc_buf overflow\n");
384 else if ( cref && cread && cref!=cread )
386 uint8_t qual = quals[iread] + 1;
387 if ( qual>=stats->nquals )
388 error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
390 int idx = is_fwd ? icycle : read_len-icycle-1;
391 if ( idx>stats->max_len )
392 error("mpc: %d>%d\n",idx,stats->max_len);
394 idx = idx*stats->nquals + qual;
395 if ( idx>=stats->nquals*stats->nbases )
396 error("FIXME: mpc_buf overflow\n");
407 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
413 faidx_t *fai = stats->fai;
416 chr = stats->sam->header->target_name[tid];
418 // ID of the sequence name
419 iter = kh_get(s, h, chr);
420 if (iter == kh_end(h))
421 error("No such reference sequence [%s]?\n", chr);
422 val = kh_value(h, iter);
424 // Check the boundaries
426 error("Was the bam file mapped with the reference sequence supplied?"
427 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
428 int size = stats->nref_seq;
429 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
430 if (size+pos > val.len) size = val.len-pos;
432 // Position the razf reader
433 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
435 uint8_t *ptr = stats->rseq_buf;
437 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
442 // Conversion between uint8_t coding and ACGT
445 if ( c=='A' || c=='a' )
447 else if ( c=='C' || c=='c' )
449 else if ( c=='G' || c=='g' )
451 else if ( c=='T' || c=='t' )
458 if ( nread < stats->nref_seq )
460 memset(ptr,0, stats->nref_seq - nread);
461 nread = stats->nref_seq;
463 stats->rseq_len = nread;
464 stats->rseq_pos = pos;
468 float fai_gc_content(stats_t *stats)
471 int i,size = stats->rseq_len;
475 for (i=0; i<size; i++)
477 c = stats->rseq_buf[i];
483 else if ( c==1 || c==8 )
486 return count ? (float)gc/count : 0;
490 void realloc_buffers(stats_t *stats, int seq_len)
492 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
494 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
495 if ( !stats->quals_1st )
496 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
497 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
499 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
500 if ( !stats->quals_2nd )
501 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
502 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
504 if ( stats->mpc_buf )
506 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
507 if ( !stats->mpc_buf )
508 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
509 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
512 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
513 if ( !stats->acgt_cycles )
514 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
515 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
517 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
518 if ( !stats->read_lengths )
519 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
520 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
522 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
523 if ( !stats->insertions )
524 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
525 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
527 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
528 if ( !stats->deletions )
529 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
530 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
532 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
533 if ( !stats->ins_cycles_1st )
534 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
535 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
537 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
538 if ( !stats->ins_cycles_2nd )
539 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
540 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
542 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
543 if ( !stats->del_cycles_1st )
544 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
545 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
547 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
548 if ( !stats->del_cycles_2nd )
549 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
550 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
554 // Realloc the coverage distribution buffer
555 int *rbuffer = calloc(sizeof(int),seq_len*5);
556 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
557 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
558 if ( stats->cov_rbuf.start>1 )
559 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
560 stats->cov_rbuf.start = 0;
561 free(stats->cov_rbuf.buffer);
562 stats->cov_rbuf.buffer = rbuffer;
563 stats->cov_rbuf.size = seq_len*5;
566 void collect_stats(bam1_t *bam_line, stats_t *stats)
568 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
570 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
573 if ( !is_in_regions(bam_line,stats) )
576 int seq_len = bam_line->core.l_qseq;
577 if ( !seq_len ) return;
578 if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
579 if ( seq_len >= stats->nbases )
580 realloc_buffers(stats,seq_len);
581 if ( stats->max_len<seq_len )
582 stats->max_len = seq_len;
584 stats->read_lengths[seq_len]++;
586 // Count GC and ACGT per cycle
587 uint8_t base, *seq = bam1_seq(bam_line);
590 int reverse = IS_REVERSE(bam_line);
591 for (i=0; i<seq_len; i++)
593 // Conversion from uint8_t coding to ACGT
597 base = bam1_seqi(seq,i);
599 if ( base==1 || base==2 ) gc_count++;
600 else if ( base>2 ) base=3;
601 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
602 error("FIXME: acgt_cycles\n");
603 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
605 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
606 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
607 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
609 // Determine which array (1st or 2nd read) will these stats go to,
610 // trim low quality bases from end the same way BWA does,
613 uint8_t *bam_quals = bam1_qual(bam_line);
614 if ( bam_line->core.flag&BAM_FREAD2 )
616 quals = stats->quals_2nd;
618 for (i=gc_idx_min; i<gc_idx_max; i++)
623 quals = stats->quals_1st;
625 for (i=gc_idx_min; i<gc_idx_max; i++)
628 if ( stats->trim_qual>0 )
629 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
631 // Quality histogram and average quality
632 for (i=0; i<seq_len; i++)
634 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
635 if ( qual>=stats->nquals )
636 error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
637 if ( qual>stats->max_qual )
638 stats->max_qual = qual;
640 quals[ i*stats->nquals+qual ]++;
641 stats->sum_qual += qual;
644 // Look at the flags and increment appropriate counters (mapped, paired, etc)
645 if ( IS_UNMAPPED(bam_line) )
646 stats->nreads_unmapped++;
649 if ( !bam_line->core.qual )
652 count_indels(stats,bam_line);
654 // The insert size is tricky, because for long inserts the libraries are
655 // prepared differently and the pairs point in other direction. BWA does
656 // not set the paired flag for them. Similar thing is true also for 454
657 // reads. Therefore, do the insert size stats for all mapped reads.
658 int32_t isize = bam_line->core.isize;
659 if ( isize<0 ) isize = -isize;
660 if ( IS_PAIRED(bam_line) && isize!=0 )
662 stats->nreads_paired++;
663 if ( isize >= stats->nisize )
664 isize=stats->nisize-1;
666 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
667 int is_fst = IS_READ1(bam_line) ? 1 : -1;
668 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
669 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
671 if ( is_fwd*is_mfwd>0 )
672 stats->isize_other[isize]++;
673 else if ( is_fst*pos_fst>0 )
675 if ( is_fst*is_fwd>0 )
676 stats->isize_inward[isize]++;
678 stats->isize_outward[isize]++;
680 else if ( is_fst*pos_fst<0 )
682 if ( is_fst*is_fwd>0 )
683 stats->isize_outward[isize]++;
685 stats->isize_inward[isize]++;
689 stats->nreads_unpaired++;
691 // Number of mismatches
692 uint8_t *nm = bam_aux_get(bam_line,"NM");
694 stats->nmismatches += bam_aux2i(nm);
696 // Number of mapped bases from cigar
697 // Conversion from uint32_t to MIDNSHP
700 if ( bam_line->core.n_cigar == 0)
701 error("FIXME: mapped read with no cigar?\n");
703 if ( stats->regions )
705 // Count only on-target bases
706 int iref = bam_line->core.pos + 1;
707 for (i=0; i<bam_line->core.n_cigar; i++)
709 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
710 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
711 if ( cig==2 ) readlen += ncig;
714 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
715 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
716 if ( ncig<0 ) ncig = 0;
717 stats->nbases_mapped_cigar += ncig;
718 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
723 if ( iref>=stats->reg_from && iref<=stats->reg_to )
724 stats->nbases_mapped_cigar += ncig;
730 // Count the whole read
731 for (i=0; i<bam_line->core.n_cigar; i++)
733 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
734 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
735 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
736 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
739 stats->nbases_mapped += seq_len;
741 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
742 stats->is_sorted = 0;
743 stats->pos = bam_line->core.pos;
745 if ( stats->is_sorted )
747 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
748 round_buffer_flush(stats,-1);
750 // Mismatches per cycle and GC-depth graph
753 // First pass, new chromosome or sequence spanning beyond the end of the buffer
754 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen )
756 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
758 // Get the reference GC content for this bin. Note that in the current implementation
759 // the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the
760 // expected read length only ~100bp, it shouldn't really matter.
761 stats->gcd_pos = bam_line->core.pos;
764 if ( stats->igcd >= stats->ngcd )
765 error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd);
767 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats);
769 count_mismatches_per_cycle(stats,bam_line);
771 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
773 // First pass or a new chromosome
774 stats->tid = bam_line->core.tid;
775 stats->gcd_pos = bam_line->core.pos;
778 if ( stats->igcd >= stats->ngcd )
780 uint32_t n = 2*(1 + stats->ngcd);
781 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
783 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
784 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
788 stats->gcd[ stats->igcd ].depth++;
789 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
791 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
793 // Coverage distribution graph
794 round_buffer_flush(stats,bam_line->core.pos);
795 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
799 stats->total_len += seq_len;
800 if ( IS_DUP(bam_line) )
802 stats->total_len_dup += seq_len;
807 // Sort by GC and depth
808 #define GCD_t(x) ((gc_depth_t *)x)
809 static int gcd_cmp(const void *a, const void *b)
811 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
812 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
813 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
814 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
819 float gcd_percentile(gc_depth_t *gcd, int N, int p)
829 return gcd[N-1].depth;
832 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
835 void output_stats(stats_t *stats)
837 // Calculate average insert size and standard deviation (from the main bulk data only)
839 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
840 for (isize=1; isize<stats->nisize; isize++)
842 // Each pair was counted twice
843 stats->isize_inward[isize] *= 0.5;
844 stats->isize_outward[isize] *= 0.5;
845 stats->isize_other[isize] *= 0.5;
847 nisize_inward += stats->isize_inward[isize];
848 nisize_outward += stats->isize_outward[isize];
849 nisize_other += stats->isize_other[isize];
850 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
853 double bulk=0, avg_isize=0, sd_isize=0;
854 for (isize=1; isize<stats->nisize; isize++)
856 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
857 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
859 if ( bulk/nisize > stats->isize_main_bulk )
866 avg_isize /= nisize ? nisize : 1;
867 for (isize=1; isize<ibulk; isize++)
868 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
869 sd_isize = sqrt(sd_isize);
872 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
873 printf("# The command line was: %s",stats->argv[0]);
875 for (i=1; i<stats->argc; i++)
876 printf(" %s",stats->argv[i]);
878 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
879 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
880 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
881 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
882 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
883 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
884 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
885 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
886 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
887 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
888 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
889 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
890 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
891 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
892 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
893 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
894 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
895 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
896 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
897 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
898 printf("SN\taverage length:\t%.0f\n", avg_read_length);
899 printf("SN\tmaximum length:\t%d\n", stats->max_len);
900 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
901 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
902 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
903 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
904 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
905 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
908 if ( stats->max_len<stats->nbases ) stats->max_len++;
909 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
910 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
911 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
912 for (ibase=0; ibase<stats->max_len; ibase++)
914 printf("FFQ\t%d",ibase+1);
915 for (iqual=0; iqual<=stats->max_qual; iqual++)
917 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
921 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
922 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
923 for (ibase=0; ibase<stats->max_len; ibase++)
925 printf("LFQ\t%d",ibase+1);
926 for (iqual=0; iqual<=stats->max_qual; iqual++)
928 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
932 if ( stats->mpc_buf )
934 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
935 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
936 printf("# is the number of N's and the rest is the number of mismatches\n");
937 for (ibase=0; ibase<stats->max_len; ibase++)
939 printf("MPC\t%d",ibase+1);
940 for (iqual=0; iqual<=stats->max_qual; iqual++)
942 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
947 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
949 for (ibase=0; ibase<stats->ngc; ibase++)
951 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
952 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
955 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
957 for (ibase=0; ibase<stats->ngc; ibase++)
959 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
960 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
963 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
964 for (ibase=0; ibase<stats->max_len; ibase++)
966 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
967 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
968 if ( ! sum ) continue;
969 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
971 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
972 for (isize=1; isize<ibulk; isize++)
973 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
974 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
976 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
978 for (ilen=0; ilen<stats->max_len; ilen++)
980 if ( stats->read_lengths[ilen]>0 )
981 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
984 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
985 for (ilen=0; ilen<stats->nindels; ilen++)
987 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
988 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
991 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
992 for (ilen=0; ilen<=stats->nbases; ilen++)
994 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
995 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
998 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
999 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
1001 for (icov=1; icov<stats->ncov-1; icov++)
1002 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
1003 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
1006 // Calculate average GC content, then sort by GC and depth
1007 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
1009 for (igcd=0; igcd<stats->igcd; igcd++)
1012 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
1014 if ( stats->gcd[igcd].depth )
1015 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
1017 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1019 while ( igcd < stats->igcd )
1021 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1022 uint32_t nbins=0, itmp=igcd;
1023 float gc = stats->gcd[igcd].gc;
1024 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1029 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1030 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1031 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
1032 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
1033 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
1034 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
1040 size_t mygetline(char **line, size_t *n, FILE *fp)
1042 if (line == NULL || n == NULL || fp == NULL)
1047 if (*n==0 || !*line)
1055 while ((c=getc(fp))!= EOF && c!='\n')
1060 *line = realloc(*line, sizeof(char)*(*n));
1062 (*line)[nread-1] = c;
1067 *line = realloc(*line, sizeof(char)*(*n));
1070 return nread>0 ? nread : -1;
1074 void init_regions(stats_t *stats, char *file)
1077 khash_t(str) *header_hash;
1079 bam_init_header_hash(stats->sam->header);
1080 header_hash = (khash_t(str)*)stats->sam->header->hash;
1082 FILE *fp = fopen(file,"r");
1083 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1089 int prev_tid=-1, prev_pos=-1;
1090 while ((nread = mygetline(&line, &len, fp)) != -1)
1092 if ( line[0] == '#' ) continue;
1095 while ( i<nread && !isspace(line[i]) ) i++;
1096 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1099 iter = kh_get(str, header_hash, line);
1100 int tid = kh_val(header_hash, iter);
1101 if ( iter == kh_end(header_hash) )
1104 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
1109 if ( tid >= stats->nregions )
1111 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1113 for (j=stats->nregions; j<stats->nregions+100; j++)
1115 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1116 stats->regions[j].pos = NULL;
1118 stats->nregions += 100;
1120 int npos = stats->regions[tid].npos;
1121 if ( npos >= stats->regions[tid].mpos )
1123 stats->regions[tid].mpos += 1000;
1124 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1127 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1128 if ( prev_tid==-1 || prev_tid!=tid )
1131 prev_pos = stats->regions[tid].pos[npos].from;
1133 if ( prev_pos>stats->regions[tid].pos[npos].from )
1134 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1135 stats->regions[tid].npos++;
1137 if (line) free(line);
1138 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
1142 void destroy_regions(stats_t *stats)
1145 for (i=0; i<stats->nregions; i++)
1147 if ( !stats->regions[i].mpos ) continue;
1148 free(stats->regions[i].pos);
1150 if ( stats->regions ) free(stats->regions);
1153 static int fetch_read(const bam1_t *bam_line, void *data)
1155 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1159 void reset_regions(stats_t *stats)
1162 for (i=0; i<stats->nregions; i++)
1163 stats->regions[i].cpos = 0;
1166 int is_in_regions(bam1_t *bam_line, stats_t *stats)
1168 if ( !stats->regions ) return 1;
1170 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1171 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1173 regions_t *reg = &stats->regions[bam_line->core.tid];
1174 if ( reg->cpos==reg->npos ) return 0; // done for this chr
1176 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1177 // even small overlap is enough to include the read in the stats.
1179 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1180 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1181 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1183 stats->reg_from = reg->pos[i].from;
1184 stats->reg_to = reg->pos[i].to;
1189 void error(const char *format, ...)
1193 printf("Version: %s\n", BAMCHECK_VERSION);
1194 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1195 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1196 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1197 printf("Options:\n");
1198 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1199 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1200 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
1201 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
1202 printf(" -h, --help This help message\n");
1203 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1204 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1205 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1206 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1207 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1208 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1209 printf(" -s, --sam Input is SAM\n");
1215 va_start(ap, format);
1216 vfprintf(stderr, format, ap);
1222 int main(int argc, char *argv[])
1224 char *targets = NULL;
1225 char *bam_fname = NULL;
1226 samfile_t *sam = NULL;
1229 stats_t *stats = calloc(1,sizeof(stats_t));
1232 stats->nbases = 300;
1233 stats->nisize = 8000;
1234 stats->max_len = 30;
1235 stats->max_qual = 40;
1236 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1237 stats->gcd_bin_size = 20000;
1238 stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big
1239 stats->nref_seq = stats->gcd_bin_size;
1240 stats->rseq_pos = -1;
1241 stats->tid = stats->gcd_pos = -1;
1242 stats->is_sorted = 1;
1244 stats->cov_max = 1000;
1245 stats->cov_step = 1;
1248 stats->filter_readlen = -1;
1249 stats->nindels = stats->nbases;
1251 strcpy(in_mode, "rb");
1253 static struct option loptions[] =
1256 {"remove-dups",0,0,'d'},
1258 {"ref-seq",0,0,'r'},
1259 {"coverage",0,0,'c'},
1260 {"read-length",0,0,'l'},
1261 {"insert-size",0,0,'i'},
1262 {"most-inserts",0,0,'m'},
1263 {"trim-quality",0,0,'q'},
1264 {"target-regions",0,0,'t'},
1265 {"required-flag",0,0,'f'},
1266 {"filtering-flag",0,0,'F'},
1270 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:",loptions,NULL))>0 )
1274 case 'f': stats->flag_require=strtol(optarg,0,0); break;
1275 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
1276 case 'd': stats->flag_filter|=BAM_FDUP; break;
1277 case 's': strcpy(in_mode, "r"); break;
1278 case 'r': stats->fai = fai_load(optarg);
1280 error("Could not load faidx: %s\n", optarg);
1282 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1283 error("Unable to parse -c %s\n", optarg);
1285 case 'l': stats->filter_readlen = atoi(optarg); break;
1286 case 'i': stats->nisize = atoi(optarg); break;
1287 case 'm': stats->isize_main_bulk = atof(optarg); break;
1288 case 'q': stats->trim_qual = atoi(optarg); break;
1289 case 't': targets = optarg; break;
1291 case 'h': error(NULL);
1292 default: error("Unknown argument: %s\n", optarg);
1296 bam_fname = argv[optind++];
1300 if ( isatty(fileno((FILE *)stdin)) )
1306 // .. coverage bins and round buffer
1307 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1309 stats->cov_step = stats->cov_max - stats->cov_min;
1310 if ( stats->cov_step <= 0 )
1311 stats->cov_step = 1;
1313 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1314 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1315 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1316 stats->cov_rbuf.size = stats->nbases*5;
1317 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1319 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1320 error("Failed to open: %s\n", bam_fname);
1322 bam1_t *bam_line = bam_init1();
1324 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1325 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1326 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1327 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1328 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1329 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1330 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1331 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1332 stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t));
1333 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1334 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1335 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1336 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1337 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1338 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1339 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1340 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1341 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1343 init_regions(stats, targets);
1345 // Collect statistics
1348 // Collect stats in selected regions only
1349 bam_index_t *bam_idx = bam_index_load(bam_fname);
1351 error("Random alignment retrieval only works for indexed BAM files.\n");
1354 for (i=optind; i<argc; i++)
1357 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1358 if ( tid < 0 ) continue;
1359 reset_regions(stats);
1360 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1362 bam_index_destroy(bam_idx);
1366 // Stream through the entire BAM ignoring off-target regions if -t is given
1367 while (samread(sam,bam_line) >= 0)
1368 collect_stats(bam_line,stats);
1370 round_buffer_flush(stats,-1);
1372 output_stats(stats);
1374 bam_destroy1(bam_line);
1375 samclose(stats->sam);
1376 if (stats->fai) fai_destroy(stats->fai);
1377 free(stats->cov_rbuf.buffer); free(stats->cov);
1378 free(stats->quals_1st); free(stats->quals_2nd);
1379 free(stats->gc_1st); free(stats->gc_2nd);
1380 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1382 free(stats->rseq_buf);
1383 free(stats->mpc_buf);
1384 free(stats->acgt_cycles);
1385 free(stats->read_lengths);
1386 free(stats->insertions);
1387 free(stats->deletions);
1388 free(stats->ins_cycles_1st);
1389 free(stats->ins_cycles_2nd);
1390 free(stats->del_cycles_1st);
1391 free(stats->del_cycles_2nd);
1392 destroy_regions(stats);