sam_header.o:sam_header.h khash.h
bcf.o:bcftools/bcf.h
bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h
+bam2bcf_indel.o:bam2bcf.h
errmod.o:errmod.h
faidx.o:faidx.h razf.h khash.h
char *inscns;
uint16_t *bases;
errmod_t *e;
+ void *rghash;
} bcf_callaux_t;
typedef struct {
int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call);
int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP,
const bcf_callaux_t *bca, const char *ref);
- int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
+ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
+ const void *rghash);
#ifdef __cplusplus
}
#include <assert.h>
#include <ctype.h>
+#include <string.h>
#include "bam.h"
#include "bam2bcf.h"
#include "ksort.h"
#include "kaln.h"
+#include "khash.h"
+KHASH_SET_INIT_STR(rg)
#define MINUS_CONST 0x10000000
#define INDEL_WINDOW_SIZE 50
+void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
+{
+ const char *s, *p, *q, *r, *t;
+ khash_t(rg) *hash;
+ if (list == 0 || hdtext == 0) return _hash;
+ if (_hash == 0) _hash = kh_init(rg);
+ hash = (khash_t(rg)*)_hash;
+ if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
+ do {
+ t = strstr(s + 4, "@RG\t"); // the next @RG
+ if ((p = strstr(s, "\tID:")) != 0) p += 4;
+ if ((q = strstr(s, "\tPL:")) != 0) q += 4;
+ if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
+ int lp, lq;
+ char *x;
+ for (r = p; *r && *r != '\t' && *r != '\n'; ++r); lp = r - p;
+ for (r = q; *r && *r != '\t' && *r != '\n'; ++r); lq = r - q;
+ x = calloc((lp > lq? lp : lq) + 1, 1);
+ for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
+ if (strstr(list, x)) { // insert ID to the hash table
+ khint_t k;
+ int ret;
+ for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
+ x[r-p] = 0;
+ k = kh_get(rg, hash, x);
+ if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
+ else free(x);
+ } else free(x);
+ }
+ s = t;
+ } while (s);
+ return hash;
+}
+
+void bcf_call_del_rghash(void *_hash)
+{
+ khint_t k;
+ khash_t(rg) *hash = (khash_t(rg)*)_hash;
+ if (hash == 0) return;
+ for (k = kh_begin(hash); k < kh_end(hash); ++k)
+ if (kh_exist(hash, k))
+ free((char*)kh_key(hash, k));
+ kh_destroy(rg, hash);
+}
+
static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
{
int k, x = c->pos, y = 0, last_y = 0;
return max_i - pos;
}
-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
+ const void *rghash)
{
extern void ks_introsort_uint32_t(int, uint32_t*);
int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score, N, K, l_run, ref_type;
char *inscns = 0, *ref2, *query;
+ khash_t(rg) *hash = (khash_t(rg)*)rghash;
if (ref == 0 || bca == 0) return -1;
+ // mark filtered reads
+ if (rghash) {
+ N = 0;
+ for (s = N = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ const uint8_t *rg = bam_aux_get(p->b, "RG");
+ p->aux = 1; // filtered by default
+ if (rg) {
+ khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
+ if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
+ }
+ }
+ }
+ if (N == 0) return -1; // no reads left
+ }
// determine if there is a gap
for (s = N = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i)
for (s = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i) {
const bam_pileup1_t *p = plp[s] + i;
- if (p->indel != 0)
+ if (p->indel != 0 && (rghash == 0 || p->aux == 0))
aux[m++] = MINUS_CONST + p->indel;
j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
// squeeze out identical types
for (i = 1, n_types = 1; i < m; ++i)
if (aux[i] != aux[i-1]) ++n_types;
- assert(n_types > 1); // there must at least one type of non-reference indel
+ if (n_types == 1) { // no indels
+ free(aux); return -1;
+ }
types = (int*)calloc(n_types, sizeof(int));
t = 0;
types[t++] = aux[0] - MINUS_CONST;
typedef struct {
int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth;
- char *reg, *fn_pos;
+ char *reg, *fn_pos, *pl_list;
faidx_t *fai;
kh_64_t *hash;
} mplp_conf_t;
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
+ extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
+ extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth;
const bam_pileup1_t **plp;
bam_header_t *h = 0;
char *ref;
khash_t(64) *hash = 0;
+ void *rghash = 0;
bcf_callaux_t *bca = 0;
bcf_callret1_t *bcr = 0;
data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
h_tmp = bam_header_read(data[i]->fp);
bam_smpl_add(sm, fn[i], h_tmp->text);
+ rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
if (conf->reg) {
int beg, end;
bam_index_t *idx;
bcf_hdr_write(bp, bh);
bca = bcf_call_init(-1., conf->min_baseQ);
bcr = calloc(sm->n, sizeof(bcf_callret1_t));
+ bca->rghash = rghash;
}
ref_tid = -1; ref = 0;
iter = bam_mplp_init(n, mplp_func, (void**)data);
bcf_write(bp, bh, b);
bcf_destroy(b);
// call indels
- if (bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref) >= 0) {
+ if (bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
for (i = 0; i < gplp.n; ++i)
bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
bcf_call_combine(gplp.n, bcr, -1, &bc);
bam_smpl_destroy(sm); free(buf.s);
for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
+ bcf_call_del_rghash(rghash);
if (hash) { // free the hash table
khint_t k;
for (k = kh_begin(hash); k < kh_end(hash); ++k)
mplp.capQ_thres = 0;
mplp.max_depth = 250;
mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
- while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:b:")) >= 0) {
+ while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:b:P:")) >= 0) {
switch (c) {
case 'f':
mplp.fai = fai_load(optarg);
case 'd': mplp.max_depth = atoi(optarg); break;
case 'r': mplp.reg = strdup(optarg); break;
case 'l': mplp.fn_pos = strdup(optarg); break;
+ case 'P': mplp.pl_list = strdup(optarg); break;
case 'g': mplp.flag |= MPLP_GLF; break;
case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break;
case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break;
fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ);
fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq);
fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth);
+ fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
fprintf(stderr, " -g generate BCF output\n");
fprintf(stderr, " -u do not compress BCF output\n");
fprintf(stderr, " -B disable BAQ computation\n");
}
else
mpileup(&mplp, argc - optind, argv + optind);
- free(mplp.reg);
+ free(mplp.reg); free(mplp.pl_list);
if (mplp.fai) fai_destroy(mplp.fai);
return 0;
}
#endif
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.9-7 (r804)"
+#define PACKAGE_VERSION "0.1.9-8 (r807)"
#endif
int bam_taf2baf(int argc, char *argv[]);
exit;
sub main {
- my $version = '0.1.0';
&usage if (@ARGV < 1);
my $command = shift(@ARGV);
my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
- hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&filter4vcf, ldstats=>\&ldstats);
+ hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
}
}
sub varFilter {
- my %opts = (d=>1, D=>10000, a=>2, l=>10, Q=>10, s=>100, w=>10, p=>undef, F=>.001);
- getopts('pd:D:l:Q:w:G:F:a:', \%opts);
+ my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
+ getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
die(qq/
Usage: vcfutils.pl varFilter [options] <in.vcf>
-D INT maximum read depth [$opts{D}]
-a INT minimum number of alternate bases [$opts{a}]
-w INT SNP within INT bp around a gap to be filtered [$opts{w}]
- -l INT window size for filtering adjacent gaps [$opts{l}]
+ -W INT window size for filtering adjacent gaps [$opts{W}]
+ -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
+ -2 FLOAT min P-value for baseQ bias [$opts{2}]
+ -3 FLOAT min P-value for mapQ bias [$opts{3}]
+ -4 FLOAT min P-value for end distance bias [$opts{4}]
-p print filtered variants
+
+Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
\n/) if (@ARGV == 0 && -t STDIN);
# calculate the window size
- my ($ol, $ow) = ($opts{l}, $opts{w});
+ my ($ol, $ow) = ($opts{W}, $opts{w});
my $max_dist = $ol > $ow? $ol : $ow;
# the core loop
my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
while (<>) {
my @t = split;
- next if (/^#/);
+ if (/^#/) {
+ print; next;
+ }
next if ($t[4] eq '.'); # skip non-var sites
+ # check if the site is a SNP
my $is_snp = 1;
if (length($t[3]) > 1) {
$is_snp = 0;
varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
}
my $flt = 0;
-
# parse annotations
my ($dp, $mq, $dp_alt) = (-1, -1, -1);
if ($t[7] =~ /DP=(\d+)/i) {
}
$flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
$flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
+ $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
+ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
# site dependent filters
my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
if ($first->[1] == 0) {
print join("\t", @$first[3 .. @$first-1]), "\n";
} elsif ($is_print) {
- print STDERR join("\t", substr("UQdDaGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
- }
-}
-
-sub filter4vcf {
- my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
- getopts('d:D:1:2:3:4:Q:q:', \%opts);
- die(qq/
-Usage: vcfutils.pl filter4vcf [options] <in.vcf>
-
-Options: -d INT min total depth (given DP or DP4) [$opts{d}]
- -D INT max total depth [$opts{D}]
- -q INT min SNP quality [$opts{q}]
- -Q INT min RMS mapQ (given MQ) [$opts{Q}]
- -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
- -2 FLOAT min P-value for baseQ bias [$opts{2}]
- -3 FLOAT min P-value for mapQ bias [$opts{3}]
- -4 FLOAT min P-value for end distance bias [$opts{4}]\n
-/) if (@ARGV == 0 && -t STDIN);
-
- my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
-
- my @n = (0, 0);
- while (<>) {
- if (/^#/) {
- print;
- next;
- }
- next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
- my $depth = -1;
- $depth = $1 if (/DP=(\d+)/);
- $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
- next if ($depth > 0 && ($depth < $opts{d} || $depth > $opts{D}));
- next if (/MQ=(\d+)/ && $1 < $opts{Q});
- my @t = split;
- next if ($t[5] >= 0 && $t[5] < $opts{q});
- ++$n[0];
- my @s = split(',', $t[4]);
- ++$n[1] if ($ts{$t[3].$s[0]});
- print;
+ print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
}
}
fillac fill the allele count field
qstats SNP stats stratified by QUAL
varFilter filtering short variants
- filter4vcf filtering VCFs produced by samtools+bcftools
hapmap2vcf convert the hapmap format to VCF
ucscsnp2vcf convert UCSC SNP SQL dump to VCF
\n/);