From ec8ca3b21511355c3770767be5a7c9b54e218a53 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 5 Jul 2012 18:30:12 +0100 Subject: [PATCH] Allow filtering by read group or sample name in bamcheck. Bug fix in rm_info. --- bam_tview.c | 40 +++++++++++++++++++++------- bcftools/bcf.c | 2 +- bcftools/prob1.c | 2 +- misc/bamcheck.c | 68 +++++++++++++++++++++++++++++++++++++++--------- sam_header.c | 30 +++++++++++++++++++++ sam_header.h | 7 +++++ 6 files changed, 124 insertions(+), 25 deletions(-) diff --git a/bam_tview.c b/bam_tview.c index 1967b7c..f8a1f2c 100644 --- a/bam_tview.c +++ b/bam_tview.c @@ -25,6 +25,9 @@ #include "faidx.h" #include "bam2bcf.h" #include "sam_header.h" +#include "khash.h" + +KHASH_MAP_INIT_STR(kh_rg, const char *) char bam_aux_getCEi(bam1_t *b, int i); char bam_aux_getCSi(bam1_t *b, int i); @@ -57,8 +60,7 @@ typedef struct { int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; char *ref; - char *sample; //TODO: multiple samples and read groups - void *rg2sm; + khash_t(kh_rg) *rg_hash; } tview_t; int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) @@ -216,9 +218,28 @@ tview_t *tv_init(const char *fn, const char *fn_fa, char *samples) if ( samples ) { - tv->sample = samples; - tv->header->dict = sam_header_parse2(tv->header->text); - tv->rg2sm = sam_header2tbl(tv->header->dict, "RG", "ID", "SM"); + if ( !tv->header->dict ) tv->header->dict = sam_header_parse2(tv->header->text); + void *iter = tv->header->dict; + const char *key, *val; + int n = 0; + tv->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(samples,key) || (val && !strcmp(samples,val)) ) + { + khiter_t k = kh_get(kh_rg, tv->rg_hash, key); + if ( k != kh_end(tv->rg_hash) ) continue; + int ret; + k = kh_put(kh_rg, tv->rg_hash, key, &ret); + kh_value(tv->rg_hash, k) = val; + n++; + } + } + if ( !n ) + { + fprintf(stderr,"The sample or read group \"%s\" not present.\n", samples); + exit(-1); + } } initscr(); @@ -262,13 +283,12 @@ void tv_destroy(tview_t *tv) int tv_fetch_func(const bam1_t *b, void *data) { tview_t *tv = (tview_t*)data; - if ( tv->sample ) + if ( tv->rg_hash ) { const uint8_t *rg = bam_aux_get(b, "RG"); if ( !rg ) return 0; - const char *sm = sam_tbl_get(tv->rg2sm, (const char*)(rg + 1)); - if ( !sm ) return 0; - if ( strcmp(sm,tv->sample) ) return 0; + khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(tv->rg_hash) ) return 0; } if (tv->no_skip) { uint32_t *cigar = bam1_cigar(b); // this is cheating... @@ -442,7 +462,7 @@ void error(const char *format, ...) fprintf(stderr, "Usage: bamtk tview [options] [ref.fasta]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -p chr:pos go directly to this position\n"); - fprintf(stderr, " -s STR display only reads from this sample\n"); + fprintf(stderr, " -s STR display only reads from this sample or grou\n"); fprintf(stderr, "\n\n"); } else diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 6695c74..c705998 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -340,7 +340,7 @@ int remove_tag(char *str, const char *tag, char delim) len_diff += q-p; if ( ! *q ) { *p = 0; break; } // the tag was last, no delim follows else - memmove(p,q,ori_len-(int)(p-str)-(int)(q-p)-1); // *q==delim + memmove(p,q,ori_len-(int)(p-str)-(int)(q-p)); // *q==delim } if ( len_diff==ori_len ) str[0]='.', str[1]=0, len_diff--; diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 1289437..0af5955 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -447,9 +447,9 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); } + kputc('\0', &s); rm_info(&s, "I16="); rm_info(&s, "QS="); - kputc('\0', &s); } kputs(b->fmt, &s); kputc('\0', &s); free(b->str); diff --git a/misc/bamcheck.c b/misc/bamcheck.c index 26f7c0a..8c54388 100644 --- a/misc/bamcheck.c +++ b/misc/bamcheck.c @@ -29,6 +29,7 @@ #include "faidx.h" #include "khash.h" #include "sam.h" +#include "sam_header.h" #include "razf.h" #define BWA_MIN_RDLEN 35 @@ -47,13 +48,14 @@ typedef struct uint64_t offset; } faidx1_t; -KHASH_MAP_INIT_STR(s, faidx1_t) -KHASH_MAP_INIT_STR(str, int) +KHASH_MAP_INIT_STR(kh_faidx, faidx1_t) +KHASH_MAP_INIT_STR(kh_bam_tid, int) +KHASH_MAP_INIT_STR(kh_rg, const char *) struct __faidx_t { RAZF *rz; int n, m; char **name; - khash_t(s) *hash; + khash_t(kh_faidx) *hash; }; typedef struct @@ -152,10 +154,11 @@ typedef struct // Auxiliary data int flag_require, flag_filter; - double sum_qual; // For calculating average quality value - samfile_t *sam; // Unused - faidx_t *fai; // Reference sequence for GC-depth graph - int argc; // Command line arguments to be printed on the output + double sum_qual; // For calculating average quality value + samfile_t *sam; + khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated + faidx_t *fai; // Reference sequence for GC-depth graph + int argc; // Command line arguments to be printed on the output char **argv; } stats_t; @@ -406,7 +409,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) { - khash_t(s) *h; + khash_t(kh_faidx) *h; khiter_t iter; faidx1_t val; char *chr, c; @@ -416,7 +419,7 @@ void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) chr = stats->sam->header->target_name[tid]; // ID of the sequence name - iter = kh_get(s, h, chr); + iter = kh_get(kh_faidx, h, chr); if (iter == kh_end(h)) error("No such reference sequence [%s]?\n", chr); val = kh_value(h, iter); @@ -565,6 +568,13 @@ void realloc_buffers(stats_t *stats, int seq_len) void collect_stats(bam1_t *bam_line, stats_t *stats) { + if ( stats->rg_hash ) + { + const uint8_t *rg = bam_aux_get(bam_line, "RG"); + if ( !rg ) return; + khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(stats->rg_hash) ) return; + } if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require ) return; if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) ) @@ -1074,10 +1084,10 @@ size_t mygetline(char **line, size_t *n, FILE *fp) void init_regions(stats_t *stats, char *file) { khiter_t iter; - khash_t(str) *header_hash; + khash_t(kh_bam_tid) *header_hash; bam_init_header_hash(stats->sam->header); - header_hash = (khash_t(str)*)stats->sam->header->hash; + header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash; FILE *fp = fopen(file,"r"); if ( !fp ) error("%s: %s\n",file,strerror(errno)); @@ -1096,7 +1106,7 @@ void init_regions(stats_t *stats, char *file) if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line); line[i] = 0; - iter = kh_get(str, header_hash, line); + iter = kh_get(kh_bam_tid, header_hash, line); int tid = kh_val(header_hash, iter); if ( iter == kh_end(header_hash) ) { @@ -1186,6 +1196,32 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats) return 1; } +void init_group_id(stats_t *stats, char *id) +{ + if ( !stats->sam->header->dict ) + stats->sam->header->dict = sam_header_parse2(stats->sam->header->text); + void *iter = stats->sam->header->dict; + const char *key, *val; + int n = 0; + stats->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) + { + khiter_t k = kh_get(kh_rg, stats->rg_hash, key); + if ( k != kh_end(stats->rg_hash) ) + fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); + int ret; + k = kh_put(kh_rg, stats->rg_hash, key, &ret); + kh_value(stats->rg_hash, k) = val; + n++; + } + } + if ( !n ) + error("The sample or read group \"%s\" not present.\n", id); +} + + void error(const char *format, ...) { if ( !format ) @@ -1201,6 +1237,7 @@ void error(const char *format, ...) printf(" -F, --filtering-flag Filtering flag, 0 for unset [0]\n"); printf(" -h, --help This help message\n"); printf(" -i, --insert-size Maximum insert size [8000]\n"); + printf(" -I, --id Include only listed read group or sample name\n"); printf(" -l, --read-length Include in the statistics only reads with the given read length []\n"); printf(" -m, --most-inserts Report only the main part of inserts [0.99]\n"); printf(" -q, --trim-quality The BWA trimming parameter [0]\n"); @@ -1223,6 +1260,7 @@ int main(int argc, char *argv[]) { char *targets = NULL; char *bam_fname = NULL; + char *group_id = NULL; samfile_t *sam = NULL; char in_mode[5]; @@ -1264,10 +1302,11 @@ int main(int argc, char *argv[]) {"target-regions",0,0,'t'}, {"required-flag",0,0,'f'}, {"filtering-flag",0,0,'F'}, + {"id",0,0,'I'}, {0,0,0,0} }; int opt; - while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:",loptions,NULL))>0 ) + while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:",loptions,NULL))>0 ) { switch (opt) { @@ -1287,6 +1326,7 @@ int main(int argc, char *argv[]) case 'm': stats->isize_main_bulk = atof(optarg); break; case 'q': stats->trim_qual = atoi(optarg); break; case 't': targets = optarg; break; + case 'I': group_id = optarg; break; case '?': case 'h': error(NULL); default: error("Unknown argument: %s\n", optarg); @@ -1319,6 +1359,7 @@ int main(int argc, char *argv[]) if ((sam = samopen(bam_fname, in_mode, NULL)) == 0) error("Failed to open: %s\n", bam_fname); stats->sam = sam; + if ( group_id ) init_group_id(stats, group_id); bam1_t *bam_line = bam_init1(); // .. arrays stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); @@ -1391,6 +1432,7 @@ int main(int argc, char *argv[]) free(stats->del_cycles_2nd); destroy_regions(stats); free(stats); + if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash); return 0; } diff --git a/sam_header.c b/sam_header.c index d348d10..a1b5181 100644 --- a/sam_header.c +++ b/sam_header.c @@ -669,6 +669,36 @@ char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n return ret; } +void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) +{ + list_t *l = iter; + if ( !l ) return NULL; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key, *value; + key = header_line_has_tag(hline,key_tag); + value = header_line_has_tag(hline,value_tag); + if ( !key && !value ) + { + l = l->next; + continue; + } + + *_key = key->value; + *_value = value->value; + return l->next; + } + return l; +} + const char *sam_tbl_get(void *h, const char *key) { khash_t(str) *tbl = (khash_t(str)*)h; diff --git a/sam_header.h b/sam_header.h index e5c754f..ebea12f 100644 --- a/sam_header.h +++ b/sam_header.h @@ -10,6 +10,13 @@ extern "C" { void sam_header_free(void *header); char *sam_header_write(const void *headerDict); // returns a newly allocated string + /* + // Usage example + const char *key, *val; + void *iter = sam_header_parse2(bam->header->text); + while ( iter = sam_header_key_val(iter, "RG","ID","SM" &key,&val) ) printf("%s\t%s\n", key,val); + */ + void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value); char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); -- 2.39.2