#include "faidx.h"
#include "khash.h"
#include "sam.h"
+#include "sam_header.h"
#include "razf.h"
#define BWA_MIN_RDLEN 35
uint64_t offset;
}
faidx1_t;
-KHASH_MAP_INIT_STR(s, faidx1_t)
-KHASH_MAP_INIT_STR(str, int)
+KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
+KHASH_MAP_INIT_STR(kh_bam_tid, int)
+KHASH_MAP_INIT_STR(kh_rg, const char *)
struct __faidx_t {
RAZF *rz;
int n, m;
char **name;
- khash_t(s) *hash;
+ khash_t(kh_faidx) *hash;
};
typedef struct
{
// Parameters
int trim_qual; // bwa trim quality
- int rmdup; // Exclude reads marked as duplicates from the stats
// Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
// insert size histogram holder
uint64_t *acgt_cycles;
uint64_t *read_lengths;
uint64_t *insertions, *deletions;
- uint64_t *ins_cycles, *del_cycles;
+ uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
// The extremes encountered
int max_len; // Maximum read length
regions_t *regions;
// Auxiliary data
- double sum_qual; // For calculating average quality value
- samfile_t *sam; // Unused
- faidx_t *fai; // Reference sequence for GC-depth graph
- int argc; // Command line arguments to be printed on the output
+ int flag_require, flag_filter;
+ double sum_qual; // For calculating average quality value
+ samfile_t *sam;
+ khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
+ faidx_t *fai; // Reference sequence for GC-depth graph
+ int argc; // Command line arguments to be printed on the output
char **argv;
}
stats_t;
void count_indels(stats_t *stats,bam1_t *bam_line)
{
int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
+ int is_1st = IS_READ1(bam_line) ? 1 : 0;
int icig;
int icycle = 0;
int read_len = bam_line->core.l_qseq;
int idx = is_fwd ? icycle : read_len-icycle;
if ( idx<0 ) error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d\n", idx,stats->nbases);
- stats->ins_cycles[idx]++;
+ if ( is_1st )
+ stats->ins_cycles_1st[idx]++;
+ else
+ stats->ins_cycles_2nd[idx]++;
icycle += ncig;
if ( ncig<=stats->nindels )
stats->insertions[ncig-1]++;
int idx = is_fwd ? icycle-1 : read_len-icycle-1;
if ( idx<0 ) continue; // discard meaningless deletions
if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
- stats->del_cycles[idx]++;
+ if ( is_1st )
+ stats->del_cycles_1st[idx]++;
+ else
+ stats->del_cycles_2nd[idx]++;
if ( ncig<=stats->nindels )
stats->deletions[ncig-1]++;
continue;
void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
{
- khash_t(s) *h;
+ khash_t(kh_faidx) *h;
khiter_t iter;
faidx1_t val;
char *chr, c;
chr = stats->sam->header->target_name[tid];
// ID of the sequence name
- iter = kh_get(s, h, chr);
+ iter = kh_get(kh_faidx, h, chr);
if (iter == kh_end(h))
error("No such reference sequence [%s]?\n", chr);
val = kh_value(h, iter);
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
- stats->ins_cycles = realloc(stats->ins_cycles, (n+1)*sizeof(uint64_t));
- if ( !stats->ins_cycles )
+ stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
+ if ( !stats->ins_cycles_1st )
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
- memset(stats->ins_cycles + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
+ memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
- stats->del_cycles = realloc(stats->del_cycles, (n+1)*sizeof(uint64_t));
- if ( !stats->del_cycles )
+ stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
+ if ( !stats->ins_cycles_2nd )
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
- memset(stats->del_cycles + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
+ memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
+
+ stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
+ if ( !stats->del_cycles_1st )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
+ memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
+
+ stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
+ if ( !stats->del_cycles_2nd )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
+ memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t));
stats->nbases = n;
void collect_stats(bam1_t *bam_line, stats_t *stats)
{
- if ( stats->rmdup && IS_DUP(bam_line) )
+ if ( stats->rg_hash )
+ {
+ const uint8_t *rg = bam_aux_get(bam_line, "RG");
+ if ( !rg ) return;
+ khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
+ if ( k == kh_end(stats->rg_hash) ) return;
+ }
+ if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
+ return;
+ if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
return;
if ( !is_in_regions(bam_line,stats) )
printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
}
- printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions, number of deletions\n");
+ printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
for (ilen=0; ilen<=stats->nbases; ilen++)
{
- if ( stats->ins_cycles[ilen]>0 || stats->del_cycles[ilen]>0 )
- printf("IC\t%d\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles[ilen], (long)stats->del_cycles[ilen]);
+ if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
+ printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
}
printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
void init_regions(stats_t *stats, char *file)
{
khiter_t iter;
- khash_t(str) *header_hash;
+ khash_t(kh_bam_tid) *header_hash;
bam_init_header_hash(stats->sam->header);
- header_hash = (khash_t(str)*)stats->sam->header->hash;
+ header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
FILE *fp = fopen(file,"r");
if ( !fp ) error("%s: %s\n",file,strerror(errno));
if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
line[i] = 0;
- iter = kh_get(str, header_hash, line);
+ iter = kh_get(kh_bam_tid, header_hash, line);
int tid = kh_val(header_hash, iter);
if ( iter == kh_end(header_hash) )
{
return 1;
}
+void init_group_id(stats_t *stats, char *id)
+{
+ if ( !stats->sam->header->dict )
+ stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
+ void *iter = stats->sam->header->dict;
+ const char *key, *val;
+ int n = 0;
+ stats->rg_hash = kh_init(kh_rg);
+ while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
+ {
+ if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
+ {
+ khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
+ if ( k != kh_end(stats->rg_hash) )
+ fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
+ int ret;
+ k = kh_put(kh_rg, stats->rg_hash, key, &ret);
+ kh_value(stats->rg_hash, k) = val;
+ n++;
+ }
+ }
+ if ( !n )
+ error("The sample or read group \"%s\" not present.\n", id);
+}
+
+
void error(const char *format, ...)
{
if ( !format )
printf("Options:\n");
printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
+ printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
+ printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
printf(" -h, --help This help message\n");
printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
+ printf(" -I, --id <string> Include only listed read group or sample name\n");
printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
{
char *targets = NULL;
char *bam_fname = NULL;
+ char *group_id = NULL;
samfile_t *sam = NULL;
char in_mode[5];
{"most-inserts",0,0,'m'},
{"trim-quality",0,0,'q'},
{"target-regions",0,0,'t'},
+ {"required-flag",0,0,'f'},
+ {"filtering-flag",0,0,'F'},
+ {"id",0,0,'I'},
{0,0,0,0}
};
int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:",loptions,NULL))>0 )
+ while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:",loptions,NULL))>0 )
{
switch (opt)
{
- case 'd': stats->rmdup=1; break;
+ case 'f': stats->flag_require=strtol(optarg,0,0); break;
+ case 'F': stats->flag_filter=strtol(optarg,0,0); break;
+ case 'd': stats->flag_filter|=BAM_FDUP; break;
case 's': strcpy(in_mode, "r"); break;
case 'r': stats->fai = fai_load(optarg);
if (stats->fai==0)
case 'm': stats->isize_main_bulk = atof(optarg); break;
case 'q': stats->trim_qual = atoi(optarg); break;
case 't': targets = optarg; break;
+ case 'I': group_id = optarg; break;
case '?':
case 'h': error(NULL);
default: error("Unknown argument: %s\n", optarg);
if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
error("Failed to open: %s\n", bam_fname);
stats->sam = sam;
+ if ( group_id ) init_group_id(stats, group_id);
bam1_t *bam_line = bam_init1();
// .. arrays
- stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
- stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
- stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
- stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
- stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
- stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t));
- stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
- stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
- stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
- stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
- stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
- stats->ins_cycles = calloc(stats->nbases+1,sizeof(uint64_t));
- stats->del_cycles = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+ stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+ stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
+ stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
+ stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
+ stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
+ stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
+ stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
+ stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t));
+ stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
+ stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
+ stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
+ stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
+ stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
+ stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
if ( targets )
init_regions(stats, targets);
free(stats->read_lengths);
free(stats->insertions);
free(stats->deletions);
- free(stats->ins_cycles);
- free(stats->del_cycles);
+ free(stats->ins_cycles_1st);
+ free(stats->ins_cycles_2nd);
+ free(stats->del_cycles_1st);
+ free(stats->del_cycles_2nd);
destroy_regions(stats);
free(stats);
+ if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);
return 0;
}