From: Petr Danecek Date: Thu, 13 Dec 2012 07:49:32 +0000 (+0000) Subject: Merge branch 'vsbuffalo-master' into develop X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=3a1bd4d97b4d58148b5a7fd845a3b6a023eecbed;hp=7ff31092e55f28de410cd88957e731e88f410741;p=samtools.git Merge branch 'vsbuffalo-master' into develop --- diff --git a/bam2depth.c b/bam2depth.c index 87a4c5b..02311ef 100644 --- a/bam2depth.c +++ b/bam2depth.c @@ -32,37 +32,58 @@ static int read_bam(void *data, bam1_t *b) // read level filters better go here return ret; } +int read_file_list(const char *file_list,int *n,char **argv[]); + #ifdef _MAIN_BAM2DEPTH int main(int argc, char *argv[]) #else int main_depth(int argc, char *argv[]) #endif { - int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; + int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure + char *file_list = NULL, **fn = NULL; bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line - while ((n = getopt(argc, argv, "r:b:q:Q:l:")) >= 0) { + while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold + case 'f': file_list = optarg; break; } } - if (optind == argc) { - fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-l minQLen] [-b in.bed] [...]\n"); + if (optind == argc && !file_list) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -b list of positions or regions\n"); + fprintf(stderr, " -f list of input BAM filenames, one per line [null]\n"); + fprintf(stderr, " -l minQLen\n"); + fprintf(stderr, " -q base quality threshold\n"); + fprintf(stderr, " -Q mapping quality threshold\n"); + fprintf(stderr, " -r region\n"); + fprintf(stderr, "\n"); return 1; } // initialize the auxiliary data structures - n = argc - optind; // the number of BAMs on the command line + if (file_list) + { + if ( read_file_list(file_list,&nfiles,&fn) ) return 1; + n = nfiles; + argv = fn; + optind = 0; + } + else + n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { @@ -113,5 +134,10 @@ int main_depth(int argc, char *argv[]) } free(data); free(reg); if (bed) bed_destroy(bed); + if ( file_list ) + { + for (i=0; i #include #include +#include +#include #include "sam.h" #include "faidx.h" #include "kstring.h" @@ -80,6 +82,7 @@ int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int rflag_require, rflag_filter; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels char *reg, *pl_list; @@ -117,6 +120,8 @@ static int mplp_func(void *data, bam1_t *b) skip = 1; continue; } + if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } + if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } if (ma->conf->bed) { // test overlap skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); if (skip) continue; @@ -210,7 +215,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); if ( !data[i]->fp ) { - fprintf(stderr, "[%s] failed to open %d-th input.\n", __func__, i+1); + fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(1); } data[i]->conf = conf; @@ -223,11 +228,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_index_t *idx; idx = bam_index_load(fn[i]); if (idx == 0) { - fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); + fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(1); } if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { - fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); + fprintf(stderr, "[%s] malformatted region or wrong seqname for %s\n", __func__, fn[i]); exit(1); } if (i == 0) tid0 = tid, beg0 = beg, end0 = end; @@ -393,11 +398,15 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } #define MAX_PATH_LEN 1024 -static int read_file_list(const char *file_list,int *n,char **argv[]) +int read_file_list(const char *file_list,int *n,char **argv[]) { char buf[MAX_PATH_LEN]; - int len, nfiles; - char **files; + int len, nfiles = 0; + char **files = NULL; + struct stat sb; + + *n = 0; + *argv = NULL; FILE *fh = fopen(file_list,"r"); if ( !fh ) @@ -406,28 +415,33 @@ static int read_file_list(const char *file_list,int *n,char **argv[]) return 1; } - // Speed is not an issue here, determine the number of files by reading the file twice - nfiles = 0; - while ( fgets(buf,MAX_PATH_LEN,fh) ) nfiles++; - - if ( fseek(fh, 0L, SEEK_SET) ) - { - fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); - return 1; - } - files = calloc(nfiles,sizeof(char*)); nfiles = 0; while ( fgets(buf,MAX_PATH_LEN,fh) ) { + // allow empty lines and trailing spaces len = strlen(buf); while ( len>0 && isspace(buf[len-1]) ) len--; if ( !len ) continue; - files[nfiles] = malloc(sizeof(char)*(len+1)); - strncpy(files[nfiles],buf,len); - files[nfiles][len] = 0; + // check sanity of the file list + buf[len] = 0; + if (stat(buf, &sb) != 0) + { + // no such file, check if it is safe to print its name + int i, safe_to_print = 1; + for (i=0; i= 0) { + static struct option lopts[] = + { + {"rf",1,0,1}, // require flag + {"ff",1,0,2}, // filter flag + {0,0,0,0} + }; + while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:",lopts,NULL)) >= 0) { switch (c) { + case 1 : mplp.rflag_require = strtol(optarg,0,0); break; + case 2 : mplp.rflag_filter = strtol(optarg,0,0); break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; @@ -513,7 +535,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); fprintf(stderr, " -A count anomalous read pairs\n"); fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -b FILE list of input BAM files [null]\n"); + fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n"); fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n"); @@ -525,6 +547,8 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -R ignore RG tags\n"); fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); + fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n"); + fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n"); fprintf(stderr, "\nOutput options:\n\n"); fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); diff --git a/ksort.h b/ksort.h index fa850ab..aa0bb93 100644 --- a/ksort.h +++ b/ksort.h @@ -26,6 +26,10 @@ /* Contact: Heng Li */ /* + 2012-12-11 (0.1.4): + + * Defined __ks_insertsort_##name as static to compile with C99. + 2008-11-16 (0.1.4): * Fixed a bug in introsort() that happens in rare cases. @@ -141,7 +145,7 @@ typedef struct { tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ } \ } \ - inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ { \ type_t *i, *j, swap_tmp; \ for (i = s + 1; i < t; ++i) \ diff --git a/misc/Makefile b/misc/Makefile index e1a5add..d36e7ac 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -28,7 +28,7 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: bamcheck:bamcheck.o - $(CC) $(CFLAGS) -o $@ bamcheck.o -lm -lz -L.. -lbam -lpthread + $(CC) $(CFLAGS) -o $@ bamcheck.o -L.. -lm -lbam -lpthread -lz bamcheck.o:bamcheck.c ../faidx.h ../khash.h ../sam.h ../razf.h $(CC) $(CFLAGS) -c -I.. -o $@ bamcheck.c diff --git a/misc/bamcheck.c b/misc/bamcheck.c index d72e800..5ab85b7 100644 --- a/misc/bamcheck.c +++ b/misc/bamcheck.c @@ -120,11 +120,13 @@ typedef struct uint64_t nreads_unmapped; uint64_t nreads_unpaired; uint64_t nreads_paired; + uint64_t nreads_anomalous; uint64_t nreads_mq0; uint64_t nbases_mapped; uint64_t nbases_mapped_cigar; uint64_t nbases_trimmed; // bwa trimmed bases uint64_t nmismatches; + uint64_t nreads_QCfailed, nreads_secondary; // GC-depth related data uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin @@ -395,7 +397,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) { uint8_t qual = quals[iread] + 1; if ( qual>=stats->nquals ) - error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals); + error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); int idx = is_fwd ? icycle : read_len-icycle-1; if ( idx>stats->max_len ) @@ -620,13 +622,17 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) return; if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) ) return; - if ( !is_in_regions(bam_line,stats) ) return; + if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen ) + return; + + if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; + if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++; int seq_len = bam_line->core.l_qseq; if ( !seq_len ) return; - if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return; + if ( seq_len >= stats->nbases ) realloc_buffers(stats,seq_len); if ( stats->max_len=stats->nquals ) - error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals); + error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); if ( qual>stats->max_qual ) stats->max_qual = qual; @@ -702,42 +708,48 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) count_indels(stats,bam_line); - // The insert size is tricky, because for long inserts the libraries are - // prepared differently and the pairs point in other direction. BWA does - // not set the paired flag for them. Similar thing is true also for 454 - // reads. Therefore, do the insert size stats for all mapped reads. - int32_t isize = bam_line->core.isize; - if ( isize<0 ) isize = -isize; - if ( IS_PAIRED(bam_line) && isize!=0 ) + if ( !IS_PAIRED(bam_line) ) + stats->nreads_unpaired++; + else { stats->nreads_paired++; - if ( isize >= stats->nisize ) - isize=stats->nisize-1; - int pos_fst = bam_line->core.mpos - bam_line->core.pos; - int is_fst = IS_READ1(bam_line) ? 1 : -1; - int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; - int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1; + if ( bam_line->core.tid!=bam_line->core.mtid ) + stats->nreads_anomalous++; - if ( is_fwd*is_mfwd>0 ) - stats->isize_other[isize]++; - else if ( is_fst*pos_fst>0 ) - { - if ( is_fst*is_fwd>0 ) - stats->isize_inward[isize]++; - else - stats->isize_outward[isize]++; - } - else if ( is_fst*pos_fst<0 ) + // The insert size is tricky, because for long inserts the libraries are + // prepared differently and the pairs point in other direction. BWA does + // not set the paired flag for them. Similar thing is true also for 454 + // reads. Mates mapped to different chromosomes have isize==0. + int32_t isize = bam_line->core.isize; + if ( isize<0 ) isize = -isize; + if ( isize >= stats->nisize ) + isize = stats->nisize-1; + if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) { - if ( is_fst*is_fwd>0 ) - stats->isize_outward[isize]++; - else - stats->isize_inward[isize]++; + int pos_fst = bam_line->core.mpos - bam_line->core.pos; + int is_fst = IS_READ1(bam_line) ? 1 : -1; + int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; + int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1; + + if ( is_fwd*is_mfwd>0 ) + stats->isize_other[isize]++; + else if ( is_fst*pos_fst>0 ) + { + if ( is_fst*is_fwd>0 ) + stats->isize_inward[isize]++; + else + stats->isize_outward[isize]++; + } + else if ( is_fst*pos_fst<0 ) + { + if ( is_fst*is_fwd>0 ) + stats->isize_outward[isize]++; + else + stats->isize_inward[isize]++; + } } } - else - stats->nreads_unpaired++; // Number of mismatches uint8_t *nm = bam_aux_get(bam_line,"NM"); @@ -889,7 +901,7 @@ void output_stats(stats_t *stats) // Calculate average insert size and standard deviation (from the main bulk data only) int isize, ibulk=0; uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0; - for (isize=1; isizenisize; isize++) + for (isize=0; isizenisize; isize++) { // Each pair was counted twice stats->isize_inward[isize] *= 0.5; @@ -903,7 +915,7 @@ void output_stats(stats_t *stats) } double bulk=0, avg_isize=0, sd_isize=0; - for (isize=1; isizenisize; isize++) + for (isize=0; isizenisize; isize++) { bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]); @@ -939,6 +951,8 @@ void output_stats(stats_t *stats) printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired); printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup); printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0); + printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed); + printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary); printf("SN\ttotal length:\t%ld\n", (long)stats->total_len); printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped); printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar); @@ -955,6 +969,7 @@ void output_stats(stats_t *stats) printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward); printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); + printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); int ibase,iqual; if ( stats->max_lennbases ) stats->max_len++; @@ -1021,7 +1036,7 @@ void output_stats(stats_t *stats) printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum); } printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); - for (isize=1; isizeisize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]); @@ -1313,7 +1328,7 @@ int main(int argc, char *argv[]) stats_t *stats = calloc(1,sizeof(stats_t)); stats->ngc = 200; - stats->nquals = 95; + stats->nquals = 256; stats->nbases = 300; stats->nisize = 8000; stats->max_len = 30; @@ -1322,7 +1337,8 @@ int main(int argc, char *argv[]) stats->gcd_bin_size = 20e3; stats->gcd_ref_size = 4.2e9; stats->rseq_pos = -1; - stats->tid = stats->gcd_pos = stats->igcd = -1; + stats->tid = stats->gcd_pos = -1; + stats->igcd = 0; stats->is_sorted = 1; stats->cov_min = 1; stats->cov_max = 1000;