+# Filter VCF records and print the survivors, unchanged, to STDOUT.
+#
+# Input is read with <>, i.e. from the files left in @ARGV after option
+# parsing, or from STDIN. Thresholds come from the command-line options
+# parsed by getopts (defaults in %opts below):
+#   -d / -D   min / max total depth (sum of DP4 when present, else DP)
+#   -q        min QUAL (6th column)
+#   -Q        min MQ
+#   -1 .. -4  min value for each of the four comma-separated PV4 numbers
+# A record is dropped as soon as one threshold is violated; a test is skipped
+# when the record does not carry the corresponding tag.
+# Dies with the usage text when no file is given and STDIN is a terminal.
+sub filter4vcf {
+  my %opts = (d=>3, D=>2000, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, Q=>10, q=>3);
+  getopts('d:D:1:2:3:4:Q:q:', \%opts);
+  die(qq/
+Usage: vcfutils.pl filter4vcf [options] <in.vcf>
+
+Options: -d INT min total depth (given DP or DP4) [$opts{d}]
+ -D INT max total depth [$opts{D}]
+ -q INT min SNP quality [$opts{q}]
+ -Q INT min RMS mapQ (given MQ) [$opts{Q}]
+ -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
+ -2 FLOAT min P-value for baseQ bias [$opts{2}]
+ -3 FLOAT min P-value for mapQ bias [$opts{3}]
+ -4 FLOAT min P-value for end distance bias [$opts{4}]\n
+/) if (@ARGV == 0 && -t STDIN);
+
+  while (<>) {
+    # NOTE(review): header lines are dropped rather than echoed, so the
+    # output is headerless -- confirm that downstream consumers expect this.
+    next if (/^#/);
+    # Drop the record when any of the four PV4 values is below its threshold.
+    next if (/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
+    # -1 means "no depth information"; DP4 takes precedence over DP.
+    my $depth = -1;
+    $depth = $1 if (/DP=(\d+)/);
+    $depth = $1+$2+$3+$4 if (/DP4=(\d+),(\d+),(\d+),(\d+)/);
+    # Use >= 0 so that a reported depth of zero is still checked against -d.
+    next if ($depth >= 0 && ($depth < $opts{d} || $depth > $opts{D}));
+    next if (/MQ=(\d+)/ && $1 < $opts{Q});
+    my @t = split;
+    # A missing QUAL ('.' or a negative number) is not held against the record.
+    my $qual = $t[5];
+    next if ($qual ne '.' && $qual >= 0 && $qual < $opts{q});
+    print;
+  }
+}
+
+# Convert a tab-separated SNP table (presumably a UCSC "snp" table dump --
+# confirm the column layout against the caller's input) into a sites-only
+# VCFv4.0 stream on STDOUT.
+#
+# Input is read with <>. Zero-based columns used: 1 -> CHROM, 2 (+1) -> POS,
+# 4 -> ID, 6 = strand, 7 = reference allele, 9 = '/'-separated observed
+# alleles, 10/11/12/13 -> INFO molType/class/valid/AF.
+# Alleles on the '-' strand are reverse-complemented. Any record whose
+# observed string is not single bases separated by '/' is treated as an
+# indel: POS moves one base to the left, '-' is removed and every allele is
+# prefixed with the placeholder base 'N'.
+# Dies with a usage line when no file is given and STDIN is a terminal.
+sub ucscsnp2vcf {
+  die("Usage: vcfutils.pl ucscsnp2vcf <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
+  print "##fileformat=VCFv4.0\n";
+  print "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
+  while (<>) {
+    # Table dumps may start with a '#'-prefixed header; never emit it (or a
+    # blank line) as a variant record.
+    next if (/^#/ || /^\s*$/);
+    chomp; # keep the line terminator out of the last field
+    my @t = split("\t");
+    my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
+    my $pos = $t[2] + 1;
+    my $observed = $t[9];
+    if ($t[6] eq '-') {
+      # Scalar assignment makes reverse() reverse the string itself.
+      $observed = reverse($observed);
+      $observed =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
+    }
+    my $ref = $t[7];
+    # Every observed allele that differs from the reference is an ALT.
+    my @alt = grep { $_ ne $ref } split("/", $observed);
+    if ($indel) {
+      --$pos;
+      for ($ref, @alt) { # $_ aliases each element, so edits stick
+        tr/-//d;
+        $_ = "N$_";
+      }
+    }
+    my $af = $t[13] > 0? ";AF=$t[13]" : '';
+    my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
+    my $info = "molType=$t[10];class=$t[11]$valid$af";
+    # VCF uses '.' for a missing value; never print an empty ALT column.
+    my $alt = @alt? join(",", @alt) : '.';
+    print join("\t", $t[1], $pos, $t[4], $ref, $alt, 0, '.', $info), "\n";
+  }
+}
+
+# Convert HapMap genotype files into VCFv4.0 with GT fields on STDOUT.
+#
+# The first command-line argument is a whitespace-separated SNP table
+# (read through "gzip -dc" when its name ends in .gz). From it, for every
+# entry spanning exactly one base, the zero-based columns 1, 3 and 7 are kept
+# as chromosome, position and reference allele, keyed by column 4 (the SNP
+# ID). The remaining arguments (or STDIN) are read with <> as HapMap files:
+# the line starting with 'rs#' supplies the sample names (columns 11
+# onwards); each other line is written as one VCF record if its ID is known,
+# its allele string is three characters long ("X/Y") and one of the two
+# alleles equals the reference. Genotype 'NN' becomes './.'. A line holding a
+# genotype with an allele outside the two declared ones is skipped entirely.
+# Progress messages go to STDERR. Dies on a missing argument or when the SNP
+# table cannot be opened or read completely.
+sub hapmap2vcf {
+  die("Usage: vcfutils.pl hapmap2vcf <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
+  my $fn = shift(@ARGV);
+  # parse UCSC SNP
+  warn("Parsing UCSC SNPs...\n");
+  my ($fh, %map);
+  # Three-argument / list-form open: the file name is never seen by a shell
+  # and cannot be mistaken for an open mode.
+  if ($fn =~ /\.gz$/) {
+    open($fh, '-|', 'gzip', '-dc', $fn) || die("hapmap2vcf: cannot run gzip on '$fn': $!\n");
+  } else {
+    open($fh, '<', $fn) || die("hapmap2vcf: cannot open '$fn': $!\n");
+  }
+  while (<$fh>) {
+    my @t = split;
+    next if ($t[3] - $t[2] != 1); # not SNP
+    $map{$t[4]} = [@t[1,3,7]];
+  }
+  # For the gzip pipe a false return also reports a non-zero exit status,
+  # i.e. a table that was only partially decompressed.
+  close($fh) || die("hapmap2vcf: error while reading '$fn'\n");
+  # write VCF
+  warn("Writing VCF...\n");
+  print "##fileformat=VCFv4.0\n";
+  LINE: while (<>) {
+    my @t = split;
+    if ($t[0] eq 'rs#') { # the first line
+      print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
+      next LINE;
+    }
+    my $site = $map{$t[0]};
+    next LINE unless ($site);
+    next LINE if (length($t[1]) != 3); # skip non-SNPs
+    my ($chr, $pos, $ref) = @$site;
+    my @u = split('/', $t[1]);
+    # Order the two alleles as (reference, alternative).
+    if ($u[1] eq $ref) {
+      @u = ($ref, $u[0]);
+    } elsif ($u[0] ne $ref) {
+      next LINE; # neither allele matches the reference
+    }
+    my $alt = $u[1];
+    my %w = ($u[0] => 0, $u[1] => 1); # allele letter -> GT index
+    my @s = ($chr, $pos, $t[0], $ref, $alt, 0, '.', '.', 'GT');
+    for my $call (@t[11..$#t]) {
+      if ($call eq 'NN') {
+        push(@s, './.');
+        next;
+      }
+      my @gt = ($w{substr($call,0,1)}, $w{substr($call,1,1)});
+      # A third allele cannot be encoded with one ALT: drop the whole line.
+      next LINE if (!defined($gt[0]) || !defined($gt[1]));
+      push(@s, "$gt[0]/$gt[1]");
+    }
+    print join("\t", @s), "\n";
+  }
+}
+