return ret;
}
+int read_file_list(const char *file_list,int *n,char **argv[]);
+
#ifdef _MAIN_BAM2DEPTH
int main(int argc, char *argv[])
#else
int main_depth(int argc, char *argv[])
#endif
{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
+ int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
void *bed = 0; // BED data structure
+ char *file_list = NULL, **fn = NULL;
bam_header_t *h = 0; // BAM header of the 1st input
aux_t **data;
bam_mplp_t mplp;
// parse the command line
- while ((n = getopt(argc, argv, "r:b:q:Q:l:")) >= 0) {
+ while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) {
switch (n) {
case 'l': min_len = atoi(optarg); break; // minimum query length
case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
case 'q': baseQ = atoi(optarg); break; // base quality threshold
case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
+ case 'f': file_list = optarg; break;
}
}
- if (optind == argc) {
- fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-l minQLen] [-b in.bed] <in1.bam> [...]\n");
+ if (optind == argc && !file_list) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -b <bed> list of positions or regions\n");
+ fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
+ fprintf(stderr, " -l <int> minQLen\n");
+ fprintf(stderr, " -q <int> base quality threshold\n");
+ fprintf(stderr, " -Q <int> mapping quality threshold\n");
+ fprintf(stderr, " -r <chr:from-to> region\n");
+ fprintf(stderr, "\n");
return 1;
}
// initialize the auxiliary data structures
- n = argc - optind; // the number of BAMs on the command line
+ if (file_list)
+ {
+ if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+ n = nfiles;
+ argv = fn;
+ optind = 0;
+ }
+ else
+ n = argc - optind; // the number of BAMs on the command line
data = calloc(n, sizeof(void*)); // data[i] for the i-th input
beg = 0; end = 1<<30; tid = -1; // set the default region
for (i = 0; i < n; ++i) {
}
free(data); free(reg);
if (bed) bed_destroy(bed);
+ if ( file_list )
+ {
+ for (i=0; i<n; i++) free(fn[i]);
+ free(fn);
+ }
return 0;
}
#include <ctype.h>
#include <string.h>
#include <errno.h>
+#include <sys/stat.h>
+#include <getopt.h>
#include "sam.h"
#include "faidx.h"
#include "kstring.h"
typedef struct {
int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int rflag_require, rflag_filter;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
char *reg, *pl_list;
skip = 1;
continue;
}
+ if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; }
+ if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; }
if (ma->conf->bed) { // test overlap
skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
if (skip) continue;
data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
if ( !data[i]->fp )
{
- fprintf(stderr, "[%s] failed to open %d-th input.\n", __func__, i+1);
+ fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
exit(1);
}
data[i]->conf = conf;
bam_index_t *idx;
idx = bam_index_load(fn[i]);
if (idx == 0) {
- fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
+ fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
exit(1);
}
if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
- fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
+ fprintf(stderr, "[%s] malformatted region or wrong seqname for %s\n", __func__, fn[i]);
exit(1);
}
if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
}
#define MAX_PATH_LEN 1024
-static int read_file_list(const char *file_list,int *n,char **argv[])
+int read_file_list(const char *file_list,int *n,char **argv[])
{
char buf[MAX_PATH_LEN];
- int len, nfiles;
- char **files;
+ int len, nfiles = 0;
+ char **files = NULL;
+ struct stat sb;
+
+ *n = 0;
+ *argv = NULL;
FILE *fh = fopen(file_list,"r");
if ( !fh )
return 1;
}
- // Speed is not an issue here, determine the number of files by reading the file twice
- nfiles = 0;
- while ( fgets(buf,MAX_PATH_LEN,fh) ) nfiles++;
-
- if ( fseek(fh, 0L, SEEK_SET) )
- {
- fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
- return 1;
- }
-
files = calloc(nfiles,sizeof(char*));
nfiles = 0;
while ( fgets(buf,MAX_PATH_LEN,fh) )
{
+ // allow empty lines and trailing spaces
len = strlen(buf);
while ( len>0 && isspace(buf[len-1]) ) len--;
if ( !len ) continue;
- files[nfiles] = malloc(sizeof(char)*(len+1));
- strncpy(files[nfiles],buf,len);
- files[nfiles][len] = 0;
+ // check sanity of the file list
+ buf[len] = 0;
+ if (stat(buf, &sb) != 0)
+ {
+ // no such file, check if it is safe to print its name
+ int i, safe_to_print = 1;
+ for (i=0; i<len; i++)
+ if (!isprint(buf[i])) { safe_to_print = 0; break; }
+ if ( safe_to_print )
+ fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
+ else
+ fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
+ return 1;
+ }
+
nfiles++;
+ files = realloc(files,nfiles*sizeof(char*));
+ files[nfiles-1] = strdup(buf);
}
fclose(fh);
if ( !nfiles )
mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
mplp.min_frac = 0.002; mplp.min_support = 1;
mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
- while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV")) >= 0) {
+ static struct option lopts[] =
+ {
+ {"rf",1,0,1}, // require flag
+ {"ff",1,0,2}, // filter flag
+ {0,0,0,0}
+ };
+ while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:",lopts,NULL)) >= 0) {
switch (c) {
+ case 1 : mplp.rflag_require = strtol(optarg,0,0); break;
+ case 2 : mplp.rflag_filter = strtol(optarg,0,0); break;
case 'f':
mplp.fai = fai_load(optarg);
if (mplp.fai == 0) return 1;
fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n");
fprintf(stderr, " -A count anomalous read pairs\n");
fprintf(stderr, " -B disable BAQ computation\n");
- fprintf(stderr, " -b FILE list of input BAM files [null]\n");
+ fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n");
fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n");
fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n");
fprintf(stderr, " -R ignore RG tags\n");
fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
+ fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n");
+ fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n");
fprintf(stderr, "\nOutput options:\n\n");
fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n");
fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n");
/* Contact: Heng Li <lh3@sanger.ac.uk> */
/*
+ 2012-12-11 (0.1.4):
+
+ * Defined __ks_insertsort_##name as static to compile with C99.
+
2008-11-16 (0.1.4):
* Fixed a bug in introsort() that happens in rare cases.
tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
} \
} \
- inline void __ks_insertsort_##name(type_t *s, type_t *t) \
+ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
{ \
type_t *i, *j, swap_tmp; \
for (i = s + 1; i < t; ++i) \
lib:
bamcheck:bamcheck.o
- $(CC) $(CFLAGS) -o $@ bamcheck.o -lm -lz -L.. -lbam -lpthread
+ $(CC) $(CFLAGS) -o $@ bamcheck.o -L.. -lm -lbam -lpthread -lz
bamcheck.o:bamcheck.c ../faidx.h ../khash.h ../sam.h ../razf.h
$(CC) $(CFLAGS) -c -I.. -o $@ bamcheck.c
uint64_t nreads_unmapped;
uint64_t nreads_unpaired;
uint64_t nreads_paired;
+ uint64_t nreads_anomalous;
uint64_t nreads_mq0;
uint64_t nbases_mapped;
uint64_t nbases_mapped_cigar;
uint64_t nbases_trimmed; // bwa trimmed bases
uint64_t nmismatches;
+ uint64_t nreads_QCfailed, nreads_secondary;
// GC-depth related data
uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
{
uint8_t qual = quals[iread] + 1;
if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
int idx = is_fwd ? icycle : read_len-icycle-1;
if ( idx>stats->max_len )
return;
if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
return;
-
if ( !is_in_regions(bam_line,stats) )
return;
+ if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
+ return;
+
+ if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
+ if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
int seq_len = bam_line->core.l_qseq;
if ( !seq_len ) return;
- if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
+
if ( seq_len >= stats->nbases )
realloc_buffers(stats,seq_len);
if ( stats->max_len<seq_len )
{
uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
if ( qual>stats->max_qual )
stats->max_qual = qual;
count_indels(stats,bam_line);
- // The insert size is tricky, because for long inserts the libraries are
- // prepared differently and the pairs point in other direction. BWA does
- // not set the paired flag for them. Similar thing is true also for 454
- // reads. Therefore, do the insert size stats for all mapped reads.
- int32_t isize = bam_line->core.isize;
- if ( isize<0 ) isize = -isize;
- if ( IS_PAIRED(bam_line) && isize!=0 )
+ if ( !IS_PAIRED(bam_line) )
+ stats->nreads_unpaired++;
+ else
{
stats->nreads_paired++;
- if ( isize >= stats->nisize )
- isize=stats->nisize-1;
- int pos_fst = bam_line->core.mpos - bam_line->core.pos;
- int is_fst = IS_READ1(bam_line) ? 1 : -1;
- int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
- int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+ if ( bam_line->core.tid!=bam_line->core.mtid )
+ stats->nreads_anomalous++;
- if ( is_fwd*is_mfwd>0 )
- stats->isize_other[isize]++;
- else if ( is_fst*pos_fst>0 )
- {
- if ( is_fst*is_fwd>0 )
- stats->isize_inward[isize]++;
- else
- stats->isize_outward[isize]++;
- }
- else if ( is_fst*pos_fst<0 )
+ // The insert size is tricky, because for long inserts the libraries are
+ // prepared differently and the pairs point in other direction. BWA does
+ // not set the paired flag for them. Similar thing is true also for 454
+ // reads. Mates mapped to different chromosomes have isize==0.
+ int32_t isize = bam_line->core.isize;
+ if ( isize<0 ) isize = -isize;
+ if ( isize >= stats->nisize )
+ isize = stats->nisize-1;
+ if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
{
- if ( is_fst*is_fwd>0 )
- stats->isize_outward[isize]++;
- else
- stats->isize_inward[isize]++;
+ int pos_fst = bam_line->core.mpos - bam_line->core.pos;
+ int is_fst = IS_READ1(bam_line) ? 1 : -1;
+ int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
+ int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+
+ if ( is_fwd*is_mfwd>0 )
+ stats->isize_other[isize]++;
+ else if ( is_fst*pos_fst>0 )
+ {
+ if ( is_fst*is_fwd>0 )
+ stats->isize_inward[isize]++;
+ else
+ stats->isize_outward[isize]++;
+ }
+ else if ( is_fst*pos_fst<0 )
+ {
+ if ( is_fst*is_fwd>0 )
+ stats->isize_outward[isize]++;
+ else
+ stats->isize_inward[isize]++;
+ }
}
}
- else
- stats->nreads_unpaired++;
// Number of mismatches
uint8_t *nm = bam_aux_get(bam_line,"NM");
// Calculate average insert size and standard deviation (from the main bulk data only)
int isize, ibulk=0;
uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
- for (isize=1; isize<stats->nisize; isize++)
+ for (isize=0; isize<stats->nisize; isize++)
{
// Each pair was counted twice
stats->isize_inward[isize] *= 0.5;
}
double bulk=0, avg_isize=0, sd_isize=0;
- for (isize=1; isize<stats->nisize; isize++)
+ for (isize=0; isize<stats->nisize; isize++)
{
bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
+ printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
+ printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
+ printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
int ibase,iqual;
if ( stats->max_len<stats->nbases ) stats->max_len++;
printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
}
printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
- for (isize=1; isize<ibulk; isize++)
+ for (isize=0; isize<ibulk; isize++)
printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
(long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
stats_t *stats = calloc(1,sizeof(stats_t));
stats->ngc = 200;
- stats->nquals = 95;
+ stats->nquals = 256;
stats->nbases = 300;
stats->nisize = 8000;
stats->max_len = 30;
stats->gcd_bin_size = 20e3;
stats->gcd_ref_size = 4.2e9;
stats->rseq_pos = -1;
- stats->tid = stats->gcd_pos = stats->igcd = -1;
+ stats->tid = stats->gcd_pos = -1;
+ stats->igcd = 0;
stats->is_sorted = 1;
stats->cov_min = 1;
stats->cov_max = 1000;