2 Author: petr.danecek@sanger
3 gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
5 Assumptions, approximations and other issues:
6 - GC-depth graph does not split reads, the starting position determines which bin is incremented.
7 There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
8 - coverage distribution ignores softclips and deletions
9 - some stats require sorted BAMs
10 - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
11 - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
12 - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
13 counts, no splicing is done, no indels or soft clips are considered, even small overlap is
14 good enough to include the read in the stats.
18 #define BAMCHECK_VERSION "2012-09-04"
20 #define _ISOC99_SOURCE
32 #include "sam_header.h"
35 #define BWA_MIN_RDLEN 35
36 #define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
37 #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
38 #define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
39 #define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
40 #define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
41 #define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
42 #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
46 int32_t line_len, line_blen;
51 KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
52 KHASH_MAP_INIT_STR(kh_bam_tid, int)
53 KHASH_MAP_INIT_STR(kh_rg, const char *)
58 khash_t(kh_faidx) *hash;
68 // For coverage distribution, a simple pileup
77 typedef struct { uint32_t from, to; } pos_t;
88 int trim_qual; // bwa trim quality
90 // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
91 // insert size histogram holder
92 int nquals; // The number of quality bins
93 int nbases; // The maximum sequence length the allocated array can hold
94 int nisize; // The maximum insert size that the allocated array can hold
95 int ngc; // The size of gc_1st and gc_2nd
96 int nindels; // The maximum indel length for indel distribution
98 // Arrays for the histogram data
99 uint64_t *quals_1st, *quals_2nd;
100 uint64_t *gc_1st, *gc_2nd;
101 uint64_t *isize_inward, *isize_outward, *isize_other;
102 uint64_t *acgt_cycles;
103 uint64_t *read_lengths;
104 uint64_t *insertions, *deletions;
105 uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
107 // The extremes encountered
108 int max_len; // Maximum read length
109 int max_qual; // Maximum quality
110 float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part
115 uint64_t total_len_dup;
119 uint64_t nreads_unmapped;
120 uint64_t nreads_unpaired;
121 uint64_t nreads_paired;
123 uint64_t nbases_mapped;
124 uint64_t nbases_mapped_cigar;
125 uint64_t nbases_trimmed; // bwa trimmed bases
126 uint64_t nmismatches;
128 // GC-depth related data
129 uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
130 gc_depth_t *gcd; // The GC-depth bins holder
131 int gcd_bin_size; // The size of GC-depth bin
132 int32_t tid, gcd_pos; // Position of the current bin
133 int32_t pos; // Position of the last read
135 // Coverage distribution related data
136 int ncov; // The number of coverage bins
137 uint64_t *cov; // The coverage frequencies
138 int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins
139 round_buffer_t cov_rbuf; // Pileup round buffer
141 // Mismatches by read cycle
142 uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
143 int nref_seq; // The size of the buffer
144 int32_t rseq_pos; // The coordinate of the first base in the buffer
145 int32_t rseq_len; // The used part of the buffer
146 uint64_t *mpc_buf; // Mismatches per cycle
152 int nregions, reg_from,reg_to;
156 int flag_require, flag_filter;
157 double sum_qual; // For calculating average quality value
159 khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
160 faidx_t *fai; // Reference sequence for GC-depth graph
161 int argc; // Command line arguments to be printed on the output
166 void error(const char *format, ...);
167 void bam_init_header_hash(bam_header_t *header);
168 int is_in_regions(bam1_t *bam_line, stats_t *stats);
171 // Coverage distribution methods
172 inline int coverage_idx(int min, int max, int n, int step, int depth)
180 return 1 + (depth - min) / step;
183 inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
185 return (offset + (pos-refpos) % size) % size;
188 void round_buffer_flush(stats_t *stats, int64_t pos)
192 if ( pos==stats->cov_rbuf.pos )
195 int64_t new_pos = pos;
196 if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
198 // Flush the whole buffer, but in sequential order,
199 pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
202 if ( pos < stats->cov_rbuf.pos )
203 error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
205 int ifrom = stats->cov_rbuf.start;
206 int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
209 for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
211 if ( !stats->cov_rbuf.buffer[ibuf] )
213 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
215 stats->cov_rbuf.buffer[ibuf] = 0;
219 for (ibuf=ifrom; ibuf<=ito; ibuf++)
221 if ( !stats->cov_rbuf.buffer[ibuf] )
223 idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
225 stats->cov_rbuf.buffer[ibuf] = 0;
227 stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
228 stats->cov_rbuf.pos = new_pos;
231 void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
233 if ( to-from >= rbuf->size )
234 error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
235 if ( from < rbuf->pos )
236 error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
239 ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
240 ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
243 for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
244 rbuf->buffer[ibuf]++;
247 for (ibuf=ifrom; ibuf<=ito; ibuf++)
248 rbuf->buffer[ibuf]++;
251 // Calculate the number of bases in the read trimmed by BWA
252 int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse)
254 if ( len<BWA_MIN_RDLEN ) return 0;
256 // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
257 // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
258 int max_trimmed = len - BWA_MIN_RDLEN + 1;
259 int l, sum=0, max_sum=0, max_l=0;
261 for (l=0; l<max_trimmed; l++)
263 sum += trim_qual - quals[ reverse ? l : len-1-l ];
268 // This is the correct way, but bwa clips from some reason one base less
277 void count_indels(stats_t *stats,bam1_t *bam_line)
279 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
280 int is_1st = IS_READ1(bam_line) ? 1 : 0;
283 int read_len = bam_line->core.l_qseq;
284 for (icig=0; icig<bam_line->core.n_cigar; icig++)
286 // Conversion from uint32_t to MIDNSHP
289 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
290 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
294 int idx = is_fwd ? icycle : read_len-icycle;
296 error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
297 if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
299 stats->ins_cycles_1st[idx]++;
301 stats->ins_cycles_2nd[idx]++;
303 if ( ncig<=stats->nindels )
304 stats->insertions[ncig-1]++;
309 int idx = is_fwd ? icycle-1 : read_len-icycle-1;
310 if ( idx<0 ) continue; // discard meaningless deletions
311 if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
313 stats->del_cycles_1st[idx]++;
315 stats->del_cycles_2nd[idx]++;
316 if ( ncig<=stats->nindels )
317 stats->deletions[ncig-1]++;
320 if ( cig!=3 && cig!=5 )
325 void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
327 int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
328 int icig,iread=0,icycle=0;
329 int iref = bam_line->core.pos - stats->rseq_pos;
330 int read_len = bam_line->core.l_qseq;
331 uint8_t *read = bam1_seq(bam_line);
332 uint8_t *quals = bam1_qual(bam_line);
333 uint64_t *mpc_buf = stats->mpc_buf;
334 for (icig=0; icig<bam_line->core.n_cigar; icig++)
336 // Conversion from uint32_t to MIDNSHP
339 int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
340 int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
355 // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
366 error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line));
368 if ( ncig+iref > stats->rseq_len )
369 error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
372 for (im=0; im<ncig; im++)
374 uint8_t cread = bam1_seqi(read,iread);
375 uint8_t cref = stats->rseq_buf[iref];
381 int idx = is_fwd ? icycle : read_len-icycle-1;
382 if ( idx>stats->max_len )
383 error("mpc: %d>%d\n",idx,stats->max_len);
384 idx = idx*stats->nquals;
385 if ( idx>=stats->nquals*stats->nbases )
386 error("FIXME: mpc_buf overflow\n");
389 else if ( cref && cread && cref!=cread )
391 uint8_t qual = quals[iread] + 1;
392 if ( qual>=stats->nquals )
393 error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
395 int idx = is_fwd ? icycle : read_len-icycle-1;
396 if ( idx>stats->max_len )
397 error("mpc: %d>%d\n",idx,stats->max_len);
399 idx = idx*stats->nquals + qual;
400 if ( idx>=stats->nquals*stats->nbases )
401 error("FIXME: mpc_buf overflow\n");
412 void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
414 khash_t(kh_faidx) *h;
418 faidx_t *fai = stats->fai;
421 chr = stats->sam->header->target_name[tid];
423 // ID of the sequence name
424 iter = kh_get(kh_faidx, h, chr);
425 if (iter == kh_end(h))
426 error("No such reference sequence [%s]?\n", chr);
427 val = kh_value(h, iter);
429 // Check the boundaries
431 error("Was the bam file mapped with the reference sequence supplied?"
432 " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
433 int size = stats->nref_seq;
434 // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
435 if (size+pos > val.len) size = val.len-pos;
437 // Position the razf reader
438 razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
440 uint8_t *ptr = stats->rseq_buf;
442 while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
447 // Conversion between uint8_t coding and ACGT
450 if ( c=='A' || c=='a' )
452 else if ( c=='C' || c=='c' )
454 else if ( c=='G' || c=='g' )
456 else if ( c=='T' || c=='t' )
463 if ( nread < stats->nref_seq )
465 memset(ptr,0, stats->nref_seq - nread);
466 nread = stats->nref_seq;
468 stats->rseq_len = nread;
469 stats->rseq_pos = pos;
473 float fai_gc_content(stats_t *stats)
476 int i,size = stats->rseq_len;
480 for (i=0; i<size; i++)
482 c = stats->rseq_buf[i];
488 else if ( c==1 || c==8 )
491 return count ? (float)gc/count : 0;
495 void realloc_buffers(stats_t *stats, int seq_len)
497 int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
499 stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
500 if ( !stats->quals_1st )
501 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
502 memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
504 stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
505 if ( !stats->quals_2nd )
506 error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
507 memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
509 if ( stats->mpc_buf )
511 stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
512 if ( !stats->mpc_buf )
513 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
514 memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
517 stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
518 if ( !stats->acgt_cycles )
519 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
520 memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
522 stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
523 if ( !stats->read_lengths )
524 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
525 memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
527 stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
528 if ( !stats->insertions )
529 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
530 memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
532 stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
533 if ( !stats->deletions )
534 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
535 memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
537 stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
538 if ( !stats->ins_cycles_1st )
539 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
540 memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
542 stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
543 if ( !stats->ins_cycles_2nd )
544 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
545 memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
547 stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
548 if ( !stats->del_cycles_1st )
549 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
550 memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
552 stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
553 if ( !stats->del_cycles_2nd )
554 error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
555 memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
559 // Realloc the coverage distribution buffer
560 int *rbuffer = calloc(sizeof(int),seq_len*5);
561 n = stats->cov_rbuf.size-stats->cov_rbuf.start;
562 memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
563 if ( stats->cov_rbuf.start>1 )
564 memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
565 stats->cov_rbuf.start = 0;
566 free(stats->cov_rbuf.buffer);
567 stats->cov_rbuf.buffer = rbuffer;
568 stats->cov_rbuf.size = seq_len*5;
571 void collect_stats(bam1_t *bam_line, stats_t *stats)
573 if ( stats->rg_hash )
575 const uint8_t *rg = bam_aux_get(bam_line, "RG");
577 khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
578 if ( k == kh_end(stats->rg_hash) ) return;
580 if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
582 if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
585 if ( !is_in_regions(bam_line,stats) )
588 int seq_len = bam_line->core.l_qseq;
589 if ( !seq_len ) return;
590 if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
591 if ( seq_len >= stats->nbases )
592 realloc_buffers(stats,seq_len);
593 if ( stats->max_len<seq_len )
594 stats->max_len = seq_len;
596 stats->read_lengths[seq_len]++;
598 // Count GC and ACGT per cycle
599 uint8_t base, *seq = bam1_seq(bam_line);
602 int reverse = IS_REVERSE(bam_line);
603 for (i=0; i<seq_len; i++)
605 // Conversion from uint8_t coding to ACGT
609 base = bam1_seqi(seq,i);
611 if ( base==1 || base==2 ) gc_count++;
612 else if ( base>2 ) base=3;
613 if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 )
614 error("FIXME: acgt_cycles\n");
615 stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
617 int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
618 int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
619 if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
621 // Determine which array (1st or 2nd read) will these stats go to,
622 // trim low quality bases from end the same way BWA does,
625 uint8_t *bam_quals = bam1_qual(bam_line);
626 if ( bam_line->core.flag&BAM_FREAD2 )
628 quals = stats->quals_2nd;
630 for (i=gc_idx_min; i<gc_idx_max; i++)
635 quals = stats->quals_1st;
637 for (i=gc_idx_min; i<gc_idx_max; i++)
640 if ( stats->trim_qual>0 )
641 stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
643 // Quality histogram and average quality
644 for (i=0; i<seq_len; i++)
646 uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
647 if ( qual>=stats->nquals )
648 error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
649 if ( qual>stats->max_qual )
650 stats->max_qual = qual;
652 quals[ i*stats->nquals+qual ]++;
653 stats->sum_qual += qual;
656 // Look at the flags and increment appropriate counters (mapped, paired, etc)
657 if ( IS_UNMAPPED(bam_line) )
658 stats->nreads_unmapped++;
661 if ( !bam_line->core.qual )
664 count_indels(stats,bam_line);
666 // The insert size is tricky, because for long inserts the libraries are
667 // prepared differently and the pairs point in other direction. BWA does
668 // not set the paired flag for them. Similar thing is true also for 454
669 // reads. Therefore, do the insert size stats for all mapped reads.
670 int32_t isize = bam_line->core.isize;
671 if ( isize<0 ) isize = -isize;
672 if ( IS_PAIRED(bam_line) && isize!=0 )
674 stats->nreads_paired++;
675 if ( isize >= stats->nisize )
676 isize=stats->nisize-1;
678 int pos_fst = bam_line->core.mpos - bam_line->core.pos;
679 int is_fst = IS_READ1(bam_line) ? 1 : -1;
680 int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
681 int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
683 if ( is_fwd*is_mfwd>0 )
684 stats->isize_other[isize]++;
685 else if ( is_fst*pos_fst>0 )
687 if ( is_fst*is_fwd>0 )
688 stats->isize_inward[isize]++;
690 stats->isize_outward[isize]++;
692 else if ( is_fst*pos_fst<0 )
694 if ( is_fst*is_fwd>0 )
695 stats->isize_outward[isize]++;
697 stats->isize_inward[isize]++;
701 stats->nreads_unpaired++;
703 // Number of mismatches
704 uint8_t *nm = bam_aux_get(bam_line,"NM");
706 stats->nmismatches += bam_aux2i(nm);
708 // Number of mapped bases from cigar
709 // Conversion from uint32_t to MIDNSHP
712 if ( bam_line->core.n_cigar == 0)
713 error("FIXME: mapped read with no cigar?\n");
715 if ( stats->regions )
717 // Count only on-target bases
718 int iref = bam_line->core.pos + 1;
719 for (i=0; i<bam_line->core.n_cigar; i++)
721 int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
722 int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
723 if ( cig==2 ) readlen += ncig;
726 if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
727 else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
728 if ( ncig<0 ) ncig = 0;
729 stats->nbases_mapped_cigar += ncig;
730 iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
735 if ( iref>=stats->reg_from && iref<=stats->reg_to )
736 stats->nbases_mapped_cigar += ncig;
742 // Count the whole read
743 for (i=0; i<bam_line->core.n_cigar; i++)
745 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
746 stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
747 if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
748 readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
751 stats->nbases_mapped += seq_len;
753 if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
754 stats->is_sorted = 0;
755 stats->pos = bam_line->core.pos;
757 if ( stats->is_sorted )
759 if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
760 round_buffer_flush(stats,-1);
762 // Mismatches per cycle and GC-depth graph
765 // First pass, new chromosome or sequence spanning beyond the end of the buffer
766 if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen )
768 read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
770 // Get the reference GC content for this bin. Note that in the current implementation
771 // the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the
772 // expected read length only ~100bp, it shouldn't really matter.
773 stats->gcd_pos = bam_line->core.pos;
776 if ( stats->igcd >= stats->ngcd )
777 error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd);
779 stats->gcd[ stats->igcd ].gc = fai_gc_content(stats);
781 count_mismatches_per_cycle(stats,bam_line);
783 else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
785 // First pass or a new chromosome
786 stats->tid = bam_line->core.tid;
787 stats->gcd_pos = bam_line->core.pos;
790 if ( stats->igcd >= stats->ngcd )
792 uint32_t n = 2*(1 + stats->ngcd);
793 stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
795 error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
796 memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
800 stats->gcd[ stats->igcd ].depth++;
801 // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
803 stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
805 // Coverage distribution graph
806 round_buffer_flush(stats,bam_line->core.pos);
807 round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
811 stats->total_len += seq_len;
812 if ( IS_DUP(bam_line) )
814 stats->total_len_dup += seq_len;
819 // Sort by GC and depth
820 #define GCD_t(x) ((gc_depth_t *)x)
821 static int gcd_cmp(const void *a, const void *b)
823 if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
824 if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
825 if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
826 if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
831 float gcd_percentile(gc_depth_t *gcd, int N, int p)
841 return gcd[N-1].depth;
844 return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
847 void output_stats(stats_t *stats)
849 // Calculate average insert size and standard deviation (from the main bulk data only)
851 uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
852 for (isize=1; isize<stats->nisize; isize++)
854 // Each pair was counted twice
855 stats->isize_inward[isize] *= 0.5;
856 stats->isize_outward[isize] *= 0.5;
857 stats->isize_other[isize] *= 0.5;
859 nisize_inward += stats->isize_inward[isize];
860 nisize_outward += stats->isize_outward[isize];
861 nisize_other += stats->isize_other[isize];
862 nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
865 double bulk=0, avg_isize=0, sd_isize=0;
866 for (isize=1; isize<stats->nisize; isize++)
868 bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
869 avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
871 if ( bulk/nisize > stats->isize_main_bulk )
878 avg_isize /= nisize ? nisize : 1;
879 for (isize=1; isize<ibulk; isize++)
880 sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
881 sd_isize = sqrt(sd_isize);
884 printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
885 printf("# The command line was: %s",stats->argv[0]);
887 for (i=1; i<stats->argc; i++)
888 printf(" %s",stats->argv[i]);
890 printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
891 printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
892 printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
893 printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
894 printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st);
895 printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd);
896 printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired));
897 printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped);
898 printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired);
899 printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
900 printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
901 printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
902 printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
903 printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
904 printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
905 printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed);
906 printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup);
907 printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches);
908 printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
909 float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
910 printf("SN\taverage length:\t%.0f\n", avg_read_length);
911 printf("SN\tmaximum length:\t%d\n", stats->max_len);
912 printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
913 printf("SN\tinsert size average:\t%.1f\n", avg_isize);
914 printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
915 printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
916 printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
917 printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
920 if ( stats->max_len<stats->nbases ) stats->max_len++;
921 if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
922 printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
923 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
924 for (ibase=0; ibase<stats->max_len; ibase++)
926 printf("FFQ\t%d",ibase+1);
927 for (iqual=0; iqual<=stats->max_qual; iqual++)
929 printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]);
933 printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
934 printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
935 for (ibase=0; ibase<stats->max_len; ibase++)
937 printf("LFQ\t%d",ibase+1);
938 for (iqual=0; iqual<=stats->max_qual; iqual++)
940 printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]);
944 if ( stats->mpc_buf )
946 printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
947 printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
948 printf("# is the number of N's and the rest is the number of mismatches\n");
949 for (ibase=0; ibase<stats->max_len; ibase++)
951 printf("MPC\t%d",ibase+1);
952 for (iqual=0; iqual<=stats->max_qual; iqual++)
954 printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]);
959 printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
961 for (ibase=0; ibase<stats->ngc; ibase++)
963 if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
964 printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]);
967 printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
969 for (ibase=0; ibase<stats->ngc; ibase++)
971 if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
972 printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]);
975 printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
976 for (ibase=0; ibase<stats->max_len; ibase++)
978 uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
979 uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
980 if ( ! sum ) continue;
981 printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
983 printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
984 for (isize=1; isize<ibulk; isize++)
985 printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
986 (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
988 printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
990 for (ilen=0; ilen<stats->max_len; ilen++)
992 if ( stats->read_lengths[ilen]>0 )
993 printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]);
996 printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
997 for (ilen=0; ilen<stats->nindels; ilen++)
999 if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
1000 printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
1003 printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
1004 for (ilen=0; ilen<=stats->nbases; ilen++)
1006 if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
1007 printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
1010 printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
1011 printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
1013 for (icov=1; icov<stats->ncov-1; icov++)
1014 printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
1015 printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
1018 // Calculate average GC content, then sort by GC and depth
1019 printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
1021 for (igcd=0; igcd<stats->igcd; igcd++)
1024 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
1026 if ( stats->gcd[igcd].depth )
1027 stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
1029 qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
1031 while ( igcd < stats->igcd )
1033 // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
1034 uint32_t nbins=0, itmp=igcd;
1035 float gc = stats->gcd[igcd].gc;
1036 while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
1041 printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
1042 gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
1043 gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size,
1044 gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size,
1045 gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size,
1046 gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size
1052 size_t mygetline(char **line, size_t *n, FILE *fp)
1054 if (line == NULL || n == NULL || fp == NULL)
1059 if (*n==0 || !*line)
1067 while ((c=getc(fp))!= EOF && c!='\n')
1072 *line = realloc(*line, sizeof(char)*(*n));
1074 (*line)[nread-1] = c;
1079 *line = realloc(*line, sizeof(char)*(*n));
1082 return nread>0 ? nread : -1;
1086 void init_regions(stats_t *stats, char *file)
1089 khash_t(kh_bam_tid) *header_hash;
1091 bam_init_header_hash(stats->sam->header);
1092 header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
1094 FILE *fp = fopen(file,"r");
1095 if ( !fp ) error("%s: %s\n",file,strerror(errno));
1101 int prev_tid=-1, prev_pos=-1;
1102 while ((nread = mygetline(&line, &len, fp)) != -1)
1104 if ( line[0] == '#' ) continue;
1107 while ( i<nread && !isspace(line[i]) ) i++;
1108 if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
1111 iter = kh_get(kh_bam_tid, header_hash, line);
1112 int tid = kh_val(header_hash, iter);
1113 if ( iter == kh_end(header_hash) )
1116 fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
1121 if ( tid >= stats->nregions )
1123 stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
1125 for (j=stats->nregions; j<stats->nregions+100; j++)
1127 stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
1128 stats->regions[j].pos = NULL;
1130 stats->nregions += 100;
1132 int npos = stats->regions[tid].npos;
1133 if ( npos >= stats->regions[tid].mpos )
1135 stats->regions[tid].mpos += 1000;
1136 stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
1139 if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
1140 if ( prev_tid==-1 || prev_tid!=tid )
1143 prev_pos = stats->regions[tid].pos[npos].from;
1145 if ( prev_pos>stats->regions[tid].pos[npos].from )
1146 error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
1147 stats->regions[tid].npos++;
1149 if (line) free(line);
1150 if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
1154 void destroy_regions(stats_t *stats)
1157 for (i=0; i<stats->nregions; i++)
1159 if ( !stats->regions[i].mpos ) continue;
1160 free(stats->regions[i].pos);
1162 if ( stats->regions ) free(stats->regions);
1165 static int fetch_read(const bam1_t *bam_line, void *data)
1167 collect_stats((bam1_t*)bam_line,(stats_t*)data);
1171 void reset_regions(stats_t *stats)
1174 for (i=0; i<stats->nregions; i++)
1175 stats->regions[i].cpos = 0;
1178 int is_in_regions(bam1_t *bam_line, stats_t *stats)
1180 if ( !stats->regions ) return 1;
1182 if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
1183 if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
1185 regions_t *reg = &stats->regions[bam_line->core.tid];
1186 if ( reg->cpos==reg->npos ) return 0; // done for this chr
1188 // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
1189 // even small overlap is enough to include the read in the stats.
1191 while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
1192 if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
1193 if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
1195 stats->reg_from = reg->pos[i].from;
1196 stats->reg_to = reg->pos[i].to;
1201 void init_group_id(stats_t *stats, char *id)
1203 if ( !stats->sam->header->dict )
1204 stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
1205 void *iter = stats->sam->header->dict;
1206 const char *key, *val;
1208 stats->rg_hash = kh_init(kh_rg);
1209 while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
1211 if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
1213 khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
1214 if ( k != kh_end(stats->rg_hash) )
1215 fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
1217 k = kh_put(kh_rg, stats->rg_hash, key, &ret);
1218 kh_value(stats->rg_hash, k) = val;
1223 error("The sample or read group \"%s\" not present.\n", id);
1227 void error(const char *format, ...)
1231 printf("Version: %s\n", BAMCHECK_VERSION);
1232 printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
1233 printf("Usage: bamcheck [OPTIONS] file.bam\n");
1234 printf(" bamcheck [OPTIONS] file.bam chr:from-to\n");
1235 printf("Options:\n");
1236 printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
1237 printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
1238 printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
1239 printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
1240 printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,6e9]\n");
1241 printf(" -h, --help This help message\n");
1242 printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
1243 printf(" -I, --id <string> Include only listed read group or sample name\n");
1244 printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
1245 printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
1246 printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
1247 printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n");
1248 printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
1249 printf(" -s, --sam Input is SAM\n");
1255 va_start(ap, format);
1256 vfprintf(stderr, format, ap);
1262 int main(int argc, char *argv[])
1264 char *targets = NULL;
1265 char *bam_fname = NULL;
1266 char *group_id = NULL;
1267 samfile_t *sam = NULL;
1270 stats_t *stats = calloc(1,sizeof(stats_t));
1273 stats->nbases = 300;
1274 stats->nisize = 8000;
1275 stats->max_len = 30;
1276 stats->max_qual = 40;
1277 stats->isize_main_bulk = 0.99; // There are always outliers at the far end
1278 stats->gcd_bin_size = 20000;
1279 stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big
1280 stats->rseq_pos = -1;
1281 stats->tid = stats->gcd_pos = -1;
1282 stats->is_sorted = 1;
1284 stats->cov_max = 1000;
1285 stats->cov_step = 1;
1288 stats->filter_readlen = -1;
1289 stats->nindels = stats->nbases;
1291 strcpy(in_mode, "rb");
1293 static struct option loptions[] =
1296 {"remove-dups",0,0,'d'},
1298 {"ref-seq",1,0,'r'},
1299 {"coverage",1,0,'c'},
1300 {"read-length",1,0,'l'},
1301 {"insert-size",1,0,'i'},
1302 {"most-inserts",1,0,'m'},
1303 {"trim-quality",1,0,'q'},
1304 {"target-regions",0,0,'t'},
1305 {"required-flag",1,0,'f'},
1306 {"filtering-flag",0,0,'F'},
1312 while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
1316 case 'f': stats->flag_require=strtol(optarg,0,0); break;
1317 case 'F': stats->flag_filter=strtol(optarg,0,0); break;
1318 case 'd': stats->flag_filter|=BAM_FDUP; break;
1319 case 's': strcpy(in_mode, "r"); break;
1320 case 'r': stats->fai = fai_load(optarg);
1322 error("Could not load faidx: %s\n", optarg);
1326 if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
1327 error("Unable to parse --GC-depth %s\n", optarg);
1328 stats->gcd_bin_size = fbin;
1329 stats->ngcd = flen/fbin;
1332 case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
1333 error("Unable to parse -c %s\n", optarg);
1335 case 'l': stats->filter_readlen = atoi(optarg); break;
1336 case 'i': stats->nisize = atoi(optarg); break;
1337 case 'm': stats->isize_main_bulk = atof(optarg); break;
1338 case 'q': stats->trim_qual = atoi(optarg); break;
1339 case 't': targets = optarg; break;
1340 case 'I': group_id = optarg; break;
1342 case 'h': error(NULL);
1343 default: error("Unknown argument: %s\n", optarg);
1347 bam_fname = argv[optind++];
1351 if ( isatty(fileno((FILE *)stdin)) )
1357 // .. coverage bins and round buffer
1358 if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
1360 stats->cov_step = stats->cov_max - stats->cov_min;
1361 if ( stats->cov_step <= 0 )
1362 stats->cov_step = 1;
1364 stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
1365 stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
1366 stats->cov = calloc(sizeof(uint64_t),stats->ncov);
1367 stats->cov_rbuf.size = stats->nbases*5;
1368 stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
1370 if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
1371 error("Failed to open: %s\n", bam_fname);
1373 if ( group_id ) init_group_id(stats, group_id);
1374 bam1_t *bam_line = bam_init1();
1376 stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1377 stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
1378 stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
1379 stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
1380 stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
1381 stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
1382 stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
1383 stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
1384 stats->nref_seq = stats->gcd_bin_size;
1385 stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t));
1386 stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
1387 stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
1388 stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
1389 stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
1390 stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
1391 stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1392 stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1393 stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
1394 stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
1396 init_regions(stats, targets);
1398 // Collect statistics
1401 // Collect stats in selected regions only
1402 bam_index_t *bam_idx = bam_index_load(bam_fname);
1404 error("Random alignment retrieval only works for indexed BAM files.\n");
1407 for (i=optind; i<argc; i++)
1410 bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
1411 if ( tid < 0 ) continue;
1412 reset_regions(stats);
1413 bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
1415 bam_index_destroy(bam_idx);
1419 // Stream through the entire BAM ignoring off-target regions if -t is given
1420 while (samread(sam,bam_line) >= 0)
1421 collect_stats(bam_line,stats);
1423 round_buffer_flush(stats,-1);
1425 output_stats(stats);
1427 bam_destroy1(bam_line);
1428 samclose(stats->sam);
1429 if (stats->fai) fai_destroy(stats->fai);
1430 free(stats->cov_rbuf.buffer); free(stats->cov);
1431 free(stats->quals_1st); free(stats->quals_2nd);
1432 free(stats->gc_1st); free(stats->gc_2nd);
1433 free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
1435 free(stats->rseq_buf);
1436 free(stats->mpc_buf);
1437 free(stats->acgt_cycles);
1438 free(stats->read_lengths);
1439 free(stats->insertions);
1440 free(stats->deletions);
1441 free(stats->ins_cycles_1st);
1442 free(stats->ins_cycles_2nd);
1443 free(stats->del_cycles_1st);
1444 free(stats->del_cycles_2nd);
1445 destroy_regions(stats);
1447 if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);