2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14 good enough to include the read in the stats.
18 #define BAMCHECK_VERSION "2012-04-24"
20 #define _ISOC99_SOURCE
32 #include "sam_header.h"
35 #define BWA_MIN_RDLEN 35
36 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
37 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
38 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
39 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
40 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
41 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
42 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
46 int32_t line_len, line_blen;
51 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
52 KHASH_MAP_INIT_STR(kh_bam_tid, int)
53 KHASH_MAP_INIT_STR(kh_rg, const char *)
58 khash_t(kh_faidx) *hash;
68 // For coverage distribution, a simple pileup
77 typedef struct { uint32_t from, to; } pos_t;
88 int trim_qual; // bwa trim quality
90 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
91 // insert size histogram holder
92 int nquals; // The number of quality bins
93 int nbases; // The maximum sequence length the allocated array can hold
94 int nisize; // The maximum insert size that the allocated array can hold
95 int ngc; // The size of gc_1st and gc_2nd
96 int nindels; // The maximum indel length for indel distribution
98 // Arrays for the histogram data
99 uint64_t *quals_1st, *quals_2nd;
100 uint64_t *gc_1st, *gc_2nd;
101 uint64_t *isize_inward, *isize_outward, *isize_other;
102 uint64_t *acgt_cycles;
103 uint64_t *read_lengths;
104 uint64_t *insertions, *deletions;
105 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
107 // The extremes encountered
108 int max_len; // Maximum read length
109 int max_qual; // Maximum quality
110 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
115 uint64_t total_len_dup;
119 uint64_t nreads_unmapped;
120 uint64_t nreads_unpaired;
121 uint64_t nreads_paired;
123 uint64_t nbases_mapped;
124 uint64_t nbases_mapped_cigar;
125 uint64_t nbases_trimmed; // bwa trimmed bases
126 uint64_t nmismatches;
128 // GC-depth related data
129 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
130 gc_depth_t *gcd; // The GC-depth bins holder
131 int gcd_bin_size; // The size of GC-depth bin
132 int32_t tid, gcd_pos; // Position of the current bin
133 int32_t pos; // Position of the last read
135 // Coverage distribution related data
136 int ncov; // The number of coverage bins
137 uint64_t *cov; // The coverage frequencies
138 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
139 round_buffer_t cov_rbuf; // Pileup round buffer
141 // Mismatches by read cycle
142 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
143 int nref_seq; // The size of the buffer
144 int32_t rseq_pos; // The coordinate of the first base in the buffer
145 int32_t rseq_len; // The used part of the buffer
146 uint64_t *mpc_buf; // Mismatches per cycle
152 int nregions, reg_from,reg_to;
156 int flag_require, flag_filter;
157 double sum_qual; // For calculating average quality value
159 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
160 faidx_t *fai; // Reference sequence for GC-depth graph
161 int argc; // Command line arguments to be printed on the output
166 void error(const char *format, ...);
167 void bam_init_header_hash(bam_header_t *header);
168 int is_in_regions(bam1_t *bam_line, stats_t *stats);
171 // Coverage distribution methods
172 inline int coverage_idx(int min, int max, int n, int step, int depth)
180 return 1 + (depth - min) / step;
183 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
185 return (offset + (pos-refpos) % size) % size;
188 void round_buffer_flush(stats_t *stats, int64_t pos)
192 if ( pos==stats->cov_rbuf.pos )
195 int64_t new_pos = pos;
196 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
198 // Flush the whole buffer, but in sequential order,
199 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
202 if ( pos < stats->cov_rbuf.pos )
203 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
205 int ifrom = stats->cov_rbuf.start;
206 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
209 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
211 if ( !stats->cov_rbuf.buffer[ibuf] )
213 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
215 stats->cov_rbuf.buffer[ibuf] = 0;
219 for (ibuf=ifrom; ibuf<=ito; ibuf++)
221 if ( !stats->cov_rbuf.buffer[ibuf] )
223 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
225 stats->cov_rbuf.buffer[ibuf] = 0;
227 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
228 stats->cov_rbuf.pos = new_pos;
231 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
233 if ( to-from >= rbuf->size )
234 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
235 if ( from < rbuf->pos )
236 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
239 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
240 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
243 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
244 rbuf->buffer[ibuf]++;
247 for (ibuf=ifrom; ibuf<=ito; ibuf++)
248 rbuf->buffer[ibuf]++;
251 // Calculate the number of bases in the read trimmed by BWA
252 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
254 if ( len<BWA_MIN_RDLEN ) return 0;
256 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
257 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
258 int max_trimmed = len - BWA_MIN_RDLEN + 1;
259 int l, sum=0, max_sum=0, max_l=0;
261 for (l=0; l<max_trimmed; l++)
263 sum += trim_qual - quals[ reverse ? l : len-1-l ];
268 // This is the correct way, but bwa clips from some reason one base less
277 void count_indels(stats_t *stats,bam1_t *bam_line)
279 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
280 int is_1st = IS_READ1(bam_line) ? 1 : 0;
283 int read_len = bam_line->core.l_qseq;
284 for (icig=0; icig<bam_line->core.n_cigar; icig++)
286 // Conversion from uint32_t to MIDNSHP
289 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
290 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
294 int idx = is_fwd ? icycle : read_len-icycle;
295 if ( idx<0 ) error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
296 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d\n", idx,stats->nbases);
298 stats->ins_cycles_1st[idx]++;
300 stats->ins_cycles_2nd[idx]++;
302 if ( ncig<=stats->nindels )
303 stats->insertions[ncig-1]++;
308 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
309 if ( idx<0 ) continue; // discard meaningless deletions
310 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
312 stats->del_cycles_1st[idx]++;
314 stats->del_cycles_2nd[idx]++;
315 if ( ncig<=stats->nindels )
316 stats->deletions[ncig-1]++;
323 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
325 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
326 int icig,iread=0,icycle=0;
327 int iref = bam_line->core.pos - stats->rseq_pos;
328 int read_len = bam_line->core.l_qseq;
329 uint8_t *read = bam1_seq(bam_line);
330 uint8_t *quals = bam1_qual(bam_line);
331 uint64_t *mpc_buf = stats->mpc_buf;
332 for (icig=0; icig<bam_line->core.n_cigar; icig++)
334 // Conversion from uint32_t to MIDNSHP
337 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
338 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
353 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
364 error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line));
366 if ( ncig+iref > stats->rseq_len )
367 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
370 for (im=0; im<ncig; im++)
372 uint8_t cread = bam1_seqi(read,iread);
373 uint8_t cref = stats->rseq_buf[iref];
379 int idx = is_fwd ? icycle : read_len-icycle-1;
380 if ( idx>stats->max_len )
381 error("mpc: %d>%d\n",idx,stats->max_len);
382 idx = idx*stats->nquals;
383 if ( idx>=stats->nquals*stats->nbases )
384 error("FIXME: mpc_buf overflow\n");
387 else if ( cref && cread && cref!=cread )
389 uint8_t qual = quals[iread] + 1;
390 if ( qual>=stats->nquals )
391 error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
393 int idx = is_fwd ? icycle : read_len-icycle-1;
394 if ( idx>stats->max_len )
395 error("mpc: %d>%d\n",idx,stats->max_len);
397 idx = idx*stats->nquals + qual;
398 if ( idx>=stats->nquals*stats->nbases )
399 error("FIXME: mpc_buf overflow\n");
410 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
412 khash_t(kh_faidx) *h;
416 faidx_t *fai = stats->fai;
419 chr = stats->sam->header->target_name[tid];
421 // ID of the sequence name
422 iter = kh_get(kh_faidx, h, chr);
423 if (iter == kh_end(h))
424 error("No such reference sequence [%s]?\n", chr);
425 val = kh_value(h, iter);
427 // Check the boundaries
429 error("Was the bam file mapped with the reference sequence supplied?"
430 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
431 int size = stats->nref_seq;
432 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
433 if (size+pos > val.len) size = val.len-pos;
435 // Position the razf reader
436 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
438 uint8_t *ptr = stats->rseq_buf;
440 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
445 // Conversion between uint8_t coding and ACGT
448 if ( c=='A' || c=='a' )
450 else if ( c=='C' || c=='c' )
452 else if ( c=='G' || c=='g' )
454 else if ( c=='T' || c=='t' )
461 if ( nread < stats->nref_seq )
463 memset(ptr,0, stats->nref_seq - nread);
464 nread = stats->nref_seq;
466 stats->rseq_len = nread;
467 stats->rseq_pos = pos;
471 float fai_gc_content(stats_t *stats)
474 int i,size = stats->rseq_len;
478 for (i=0; i<size; i++)
480 c = stats->rseq_buf[i];
486 else if ( c==1 || c==8 )
489 return count ? (float)gc/count : 0;
493 void realloc_buffers(stats_t *stats, int seq_len)
495 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
497 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
498 if ( !stats->quals_1st )
499 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
500 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
502 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
503 if ( !stats->quals_2nd )
504 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
505 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
507 if ( stats->mpc_buf )
509 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
510 if ( !stats->mpc_buf )
511 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
512 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
515 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
516 if ( !stats->acgt_cycles )
517 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
518 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
520 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
521 if ( !stats->read_lengths )
522 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
523 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
525 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
526 if ( !stats->insertions )
527 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
528 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
530 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
531 if ( !stats->deletions )
532 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
533 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
535 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
536 if ( !stats->ins_cycles_1st )
537 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
538 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
540 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
541 if ( !stats->ins_cycles_2nd )
542 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
543 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
545 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
546 if ( !stats->del_cycles_1st )
547 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
548 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
550 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
551 if ( !stats->del_cycles_2nd )
552 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
553 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
557 // Realloc the coverage distribution buffer
558 int *rbuffer = calloc(sizeof(int),seq_len*5);
559 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
560 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
561 if ( stats->cov_rbuf.start>1 )
562 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
563 stats->cov_rbuf.start = 0;
564 free(stats->cov_rbuf.buffer);
565 stats->cov_rbuf.buffer = rbuffer;
566 stats->cov_rbuf.size = seq_len*5;
569 void collect_stats(bam1_t *bam_line, stats_t *stats)
571 if ( stats->rg_hash )
573 const uint8_t *rg = bam_aux_get(bam_line, "RG");
575 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
576 if ( k == kh_end(stats->rg_hash) ) return;
578 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
580 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
583 if ( !is_in_regions(bam_line,stats) )
586 int seq_len = bam_line->core.l_qseq;
587 if ( !seq_len ) return;
588 if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
589 if ( seq_len >= stats->nbases )
590 realloc_buffers(stats,seq_len);
591 if ( stats->max_len<seq_len )
592 stats->max_len = seq_len;
594 stats->read_lengths[seq_len]++;
596 // Count GC and ACGT per cycle
597 uint8_t base, *seq = bam1_seq(bam_line);
600 int reverse = IS_REVERSE(bam_line);
601 for (i=0; i<seq_len; i++)
603 // Conversion from uint8_t coding to ACGT
607 base = bam1_seqi(seq,i);
609 if ( base==1 || base==2 ) gc_count++;
610 else if ( base>2 ) base=3;
611 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
612 error("FIXME: acgt_cycles\n");
613 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
615 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
616 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
617 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
619 // Determine which array (1st or 2nd read) will these stats go to,
620 // trim low quality bases from end the same way BWA does,
623 uint8_t *bam_quals = bam1_qual(bam_line);
624 if ( bam_line->core.flag&BAM_FREAD2 )
626 quals = stats->quals_2nd;
628 for (i=gc_idx_min; i<gc_idx_max; i++)
633 quals = stats->quals_1st;
635 for (i=gc_idx_min; i<gc_idx_max; i++)
638 if ( stats->trim_qual>0 )
639 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
641 // Quality histogram and average quality
642 for (i=0; i<seq_len; i++)
644 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
645 if ( qual>=stats->nquals )
646 error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
647 if ( qual>stats->max_qual )
648 stats->max_qual = qual;
650 quals[ i*stats->nquals+qual ]++;
651 stats->sum_qual += qual;
654 // Look at the flags and increment appropriate counters (mapped, paired, etc)
655 if ( IS_UNMAPPED(bam_line) )
656 stats->nreads_unmapped++;
659 if ( !bam_line->core.qual )
662 count_indels(stats,bam_line);
664 // The insert size is tricky, because for long inserts the libraries are
665 // prepared differently and the pairs point in other direction. BWA does
666 // not set the paired flag for them. Similar thing is true also for 454
667 // reads. Therefore, do the insert size stats for all mapped reads.
668 int32_t isize = bam_line->core.isize;
669 if ( isize<0 ) isize = -isize;
670 if ( IS_PAIRED(bam_line) && isize!=0 )
672 stats->nreads_paired++;
673 if ( isize >= stats->nisize )
674 isize=stats->nisize-1;
676 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
677 int is_fst = IS_READ1(bam_line) ? 1 : -1;
678 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
679 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
681 if ( is_fwd*is_mfwd>0 )
682 stats->isize_other[isize]++;
683 else if ( is_fst*pos_fst>0 )
685 if ( is_fst*is_fwd>0 )
686 stats->isize_inward[isize]++;
688 stats->isize_outward[isize]++;
690 else if ( is_fst*pos_fst<0 )
692 if ( is_fst*is_fwd>0 )
693 stats->isize_outward[isize]++;
695 stats->isize_inward[isize]++;
699 stats->nreads_unpaired++;
701 // Number of mismatches
702 uint8_t *nm = bam_aux_get(bam_line,"NM");
704 stats->nmismatches += bam_aux2i(nm);
706 // Number of mapped bases from cigar
707 // Conversion from uint32_t to MIDNSHP
710 if ( bam_line->core.n_cigar == 0)
711 error("FIXME: mapped read with no cigar?\n");
713 if ( stats->regions )
715 // Count only on-target bases
716 int iref = bam_line->core.pos + 1;
717 for (i=0; i<bam_line->core.n_cigar; i++)
719 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
720 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
721 if ( cig==2 ) readlen += ncig;
724 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
725 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
726 if ( ncig<0 ) ncig = 0;
727 stats->nbases_mapped_cigar += ncig;
728 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
733 if ( iref>=stats->reg_from && iref<=stats->reg_to )
734 stats->nbases_mapped_cigar += ncig;
740 // Count the whole read
741 for (i=0; i<bam_line->core.n_cigar; i++)
743 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
744 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
745 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
746 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
749 stats->nbases_mapped += seq_len;
751 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
752 stats->is_sorted = 0;
753 stats->pos = bam_line->core.pos;
755 if ( stats->is_sorted )
757 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
758 round_buffer_flush(stats,-1);
760 // Mismatches per cycle and GC-depth graph
763 // First pass, new chromosome or sequence spanning beyond the end of the buffer
764 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen )
766 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
768 // Get the reference GC content for this bin. Note that in the current implementation
769 // the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the
770 // expected read length only ~100bp, it shouldn't really matter.
771 stats->gcd_pos = bam_line->core.pos;
774 if ( stats->igcd >= stats->ngcd )
775 error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd);
777 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats);
779 count_mismatches_per_cycle(stats,bam_line);
781 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
783 // First pass or a new chromosome
784 stats->tid = bam_line->core.tid;
785 stats->gcd_pos = bam_line->core.pos;
788 if ( stats->igcd >= stats->ngcd )
790 uint32_t n = 2*(1 + stats->ngcd);
791 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
793 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
794 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
798 stats->gcd[ stats->igcd ].depth++;
799 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
801 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
803 // Coverage distribution graph
804 round_buffer_flush(stats,bam_line->core.pos);
805 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
809 stats->total_len += seq_len;
810 if ( IS_DUP(bam_line) )
812 stats->total_len_dup += seq_len;
817 // Sort by GC and depth
818 #define GCD_t(x) ((gc_depth_t *)x)
819 static int gcd_cmp(const void *a, const void *b)
821 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
822 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
823 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
824 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
829 float gcd_percentile(gc_depth_t *gcd, int N, int p)
839 return gcd[N-1].depth;
842 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
845 void output_stats(stats_t *stats)
847 // Calculate average insert size and standard deviation (from the main bulk data only)
849 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
850 for (isize=1; isize<stats->nisize; isize++)
852 // Each pair was counted twice
853 stats->isize_inward[isize] *= 0.5;
854 stats->isize_outward[isize] *= 0.5;
855 stats->isize_other[isize] *= 0.5;
857 nisize_inward += stats->isize_inward[isize];
858 nisize_outward += stats->isize_outward[isize];
859 nisize_other += stats->isize_other[isize];
860 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
863 double bulk=0, avg_isize=0, sd_isize=0;
864 for (isize=1; isize<stats->nisize; isize++)
866 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
867 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
869 if ( bulk/nisize > stats->isize_main_bulk )
876 avg_isize /= nisize ? nisize : 1;
877 for (isize=1; isize<ibulk; isize++)
878 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
879 sd_isize = sqrt(sd_isize);
882 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
883 printf("# The command line was: %s",stats->argv[0]);
885 for (i=1; i<stats->argc; i++)
886 printf(" %s",stats->argv[i]);
888 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
889 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
890 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
891 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
892 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
893 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
894 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
895 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
896 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
897 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
898 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
899 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
900 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
901 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
902 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
903 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
904 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
905 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
906 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
907 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
908 printf("SN\taverage length:\t%.0f\n", avg_read_length);
909 printf("SN\tmaximum length:\t%d\n", stats->max_len);
910 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
911 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
912 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
913 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
914 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
915 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
918 if ( stats->max_len<stats->nbases ) stats->max_len++;
919 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
920 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
921 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
922 for (ibase=0; ibase<stats->max_len; ibase++)
924 printf("FFQ\t%d",ibase+1);
925 for (iqual=0; iqual<=stats->max_qual; iqual++)
927 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
931 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
932 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
933 for (ibase=0; ibase<stats->max_len; ibase++)
935 printf("LFQ\t%d",ibase+1);
936 for (iqual=0; iqual<=stats->max_qual; iqual++)
938 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
942 if ( stats->mpc_buf )
944 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
945 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
946 printf("# is the number of N's and the rest is the number of mismatches\n");
947 for (ibase=0; ibase<stats->max_len; ibase++)
949 printf("MPC\t%d",ibase+1);
950 for (iqual=0; iqual<=stats->max_qual; iqual++)
952 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
957 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
959 for (ibase=0; ibase<stats->ngc; ibase++)
961 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
962 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
965 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
967 for (ibase=0; ibase<stats->ngc; ibase++)
969 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
970 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
973 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
974 for (ibase=0; ibase<stats->max_len; ibase++)
976 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
977 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
978 if ( ! sum ) continue;
979 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
981 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
982 for (isize=1; isize<ibulk; isize++)
983 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
984 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
986 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
988 for (ilen=0; ilen<stats->max_len; ilen++)
990 if ( stats->read_lengths[ilen]>0 )
991 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
994 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
995 for (ilen=0; ilen<stats->nindels; ilen++)
997 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
998 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
1001 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
1002 for (ilen=0; ilen<=stats->nbases; ilen++)
1004 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
1005 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
1008 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
1009 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
1011 for (icov=1; icov<stats->ncov-1; icov++)
1012 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
1013 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
1016 // Calculate average GC content, then sort by GC and depth
1017 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
1019 for (igcd=0; igcd<stats->igcd; igcd++)
1022 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
1024 if ( stats->gcd[igcd].depth )
1025 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
1027 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1029 while ( igcd < stats->igcd )
1031 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1032 uint32_t nbins=0, itmp=igcd;
1033 float gc = stats->gcd[igcd].gc;
1034 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1039 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1040 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1041 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
1042 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
1043 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
1044 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
1050 size_t mygetline(char **line, size_t *n, FILE *fp)
1052 if (line == NULL || n == NULL || fp == NULL)
1057 if (*n==0 || !*line)
1065 while ((c=getc(fp))!= EOF && c!='\n')
1070 *line = realloc(*line, sizeof(char)*(*n));
1072 (*line)[nread-1] = c;
1077 *line = realloc(*line, sizeof(char)*(*n));
1080 return nread>0 ? nread : -1;
1084 void init_regions(stats_t *stats, char *file)
1087 khash_t(kh_bam_tid) *header_hash;
1089 bam_init_header_hash(stats->sam->header);
1090 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
1092 FILE *fp = fopen(file,"r");
1093 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1099 int prev_tid=-1, prev_pos=-1;
1100 while ((nread = mygetline(&line, &len, fp)) != -1)
1102 if ( line[0] == '#' ) continue;
1105 while ( i<nread && !isspace(line[i]) ) i++;
1106 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1109 iter = kh_get(kh_bam_tid, header_hash, line);
1110 int tid = kh_val(header_hash, iter);
1111 if ( iter == kh_end(header_hash) )
1114 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
1119 if ( tid >= stats->nregions )
1121 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1123 for (j=stats->nregions; j<stats->nregions+100; j++)
1125 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1126 stats->regions[j].pos = NULL;
1128 stats->nregions += 100;
1130 int npos = stats->regions[tid].npos;
1131 if ( npos >= stats->regions[tid].mpos )
1133 stats->regions[tid].mpos += 1000;
1134 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1137 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1138 if ( prev_tid==-1 || prev_tid!=tid )
1141 prev_pos = stats->regions[tid].pos[npos].from;
1143 if ( prev_pos>stats->regions[tid].pos[npos].from )
1144 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1145 stats->regions[tid].npos++;
1147 if (line) free(line);
1148 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
1152 void destroy_regions(stats_t *stats)
1155 for (i=0; i<stats->nregions; i++)
1157 if ( !stats->regions[i].mpos ) continue;
1158 free(stats->regions[i].pos);
1160 if ( stats->regions ) free(stats->regions);
1163 static int fetch_read(const bam1_t *bam_line, void *data)
1165 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1169 void reset_regions(stats_t *stats)
1172 for (i=0; i<stats->nregions; i++)
1173 stats->regions[i].cpos = 0;
1176 int is_in_regions(bam1_t *bam_line, stats_t *stats)
1178 if ( !stats->regions ) return 1;
1180 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1181 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1183 regions_t *reg = &stats->regions[bam_line->core.tid];
1184 if ( reg->cpos==reg->npos ) return 0; // done for this chr
1186 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1187 // even small overlap is enough to include the read in the stats.
1189 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1190 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1191 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1193 stats->reg_from = reg->pos[i].from;
1194 stats->reg_to = reg->pos[i].to;
1199 void init_group_id(stats_t *stats, char *id)
1201 if ( !stats->sam->header->dict )
1202 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
1203 void *iter = stats->sam->header->dict;
1204 const char *key, *val;
1206 stats->rg_hash = kh_init(kh_rg);
1207 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
1209 if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
1211 khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
1212 if ( k != kh_end(stats->rg_hash) )
1213 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
1215 k = kh_put(kh_rg, stats->rg_hash, key, &ret);
1216 kh_value(stats->rg_hash, k) = val;
1221 error("The sample or read group \"%s\" not present.\n", id);
1225 void error(const char *format, ...)
1229 printf("Version: %s\n", BAMCHECK_VERSION);
1230 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1231 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1232 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1233 printf("Options:\n");
1234 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1235 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1236 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
1237 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
1238 printf(" -h, --help This help message\n");
1239 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1240 printf(" -I, --id <string> Include only listed read group or sample name\n");
1241 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1242 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1243 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1244 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1245 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1246 printf(" -s, --sam Input is SAM\n");
1252 va_start(ap, format);
1253 vfprintf(stderr, format, ap);
1259 int main(int argc, char *argv[])
1261 char *targets = NULL;
1262 char *bam_fname = NULL;
1263 char *group_id = NULL;
1264 samfile_t *sam = NULL;
1267 stats_t *stats = calloc(1,sizeof(stats_t));
1270 stats->nbases = 300;
1271 stats->nisize = 8000;
1272 stats->max_len = 30;
1273 stats->max_qual = 40;
1274 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1275 stats->gcd_bin_size = 20000;
1276 stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big
1277 stats->nref_seq = stats->gcd_bin_size;
1278 stats->rseq_pos = -1;
1279 stats->tid = stats->gcd_pos = -1;
1280 stats->is_sorted = 1;
1282 stats->cov_max = 1000;
1283 stats->cov_step = 1;
1286 stats->filter_readlen = -1;
1287 stats->nindels = stats->nbases;
1289 strcpy(in_mode, "rb");
1291 static struct option loptions[] =
1294 {"remove-dups",0,0,'d'},
1296 {"ref-seq",0,0,'r'},
1297 {"coverage",0,0,'c'},
1298 {"read-length",0,0,'l'},
1299 {"insert-size",0,0,'i'},
1300 {"most-inserts",0,0,'m'},
1301 {"trim-quality",0,0,'q'},
1302 {"target-regions",0,0,'t'},
1303 {"required-flag",0,0,'f'},
1304 {"filtering-flag",0,0,'F'},
1309 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:",loptions,NULL))>0 )
1313 case 'f': stats->flag_require=strtol(optarg,0,0); break;
1314 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
1315 case 'd': stats->flag_filter|=BAM_FDUP; break;
1316 case 's': strcpy(in_mode, "r"); break;
1317 case 'r': stats->fai = fai_load(optarg);
1319 error("Could not load faidx: %s\n", optarg);
1321 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1322 error("Unable to parse -c %s\n", optarg);
1324 case 'l': stats->filter_readlen = atoi(optarg); break;
1325 case 'i': stats->nisize = atoi(optarg); break;
1326 case 'm': stats->isize_main_bulk = atof(optarg); break;
1327 case 'q': stats->trim_qual = atoi(optarg); break;
1328 case 't': targets = optarg; break;
1329 case 'I': group_id = optarg; break;
1331 case 'h': error(NULL);
1332 default: error("Unknown argument: %s\n", optarg);
1336 bam_fname = argv[optind++];
1340 if ( isatty(fileno((FILE *)stdin)) )
1346 // .. coverage bins and round buffer
1347 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1349 stats->cov_step = stats->cov_max - stats->cov_min;
1350 if ( stats->cov_step <= 0 )
1351 stats->cov_step = 1;
1353 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1354 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1355 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1356 stats->cov_rbuf.size = stats->nbases*5;
1357 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1359 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1360 error("Failed to open: %s\n", bam_fname);
1362 if ( group_id ) init_group_id(stats, group_id);
1363 bam1_t *bam_line = bam_init1();
1365 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1366 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1367 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1368 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1369 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1370 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1371 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1372 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1373 stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t));
1374 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1375 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1376 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1377 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1378 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1379 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1380 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1381 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1382 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1384 init_regions(stats, targets);
1386 // Collect statistics
1389 // Collect stats in selected regions only
1390 bam_index_t *bam_idx = bam_index_load(bam_fname);
1392 error("Random alignment retrieval only works for indexed BAM files.\n");
1395 for (i=optind; i<argc; i++)
1398 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1399 if ( tid < 0 ) continue;
1400 reset_regions(stats);
1401 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1403 bam_index_destroy(bam_idx);
1407 // Stream through the entire BAM ignoring off-target regions if -t is given
1408 while (samread(sam,bam_line) >= 0)
1409 collect_stats(bam_line,stats);
1411 round_buffer_flush(stats,-1);
1413 output_stats(stats);
1415 bam_destroy1(bam_line);
1416 samclose(stats->sam);
1417 if (stats->fai) fai_destroy(stats->fai);
1418 free(stats->cov_rbuf.buffer); free(stats->cov);
1419 free(stats->quals_1st); free(stats->quals_2nd);
1420 free(stats->gc_1st); free(stats->gc_2nd);
1421 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1423 free(stats->rseq_buf);
1424 free(stats->mpc_buf);
1425 free(stats->acgt_cycles);
1426 free(stats->read_lengths);
1427 free(stats->insertions);
1428 free(stats->deletions);
1429 free(stats->ins_cycles_1st);
1430 free(stats->ins_cycles_2nd);
1431 free(stats->del_cycles_1st);
1432 free(stats->del_cycles_2nd);
1433 destroy_regions(stats);
1435 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);