+ print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
+ }
+}
+
+sub gapstats {
+ my (@c0, @c1);
+ $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
+ while (<>) {
+ next if (/^#/);
+ my @t = split;
+ next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
+ my @s = split(',', $t[4]);
+ for my $x (@s) {
+ my $l = length($x) - length($t[3]) + 5000;
+ if ($x =~ /^-/) {
+ $l = -(length($x) - 1) + 5000;
+ } elsif ($x =~ /^\+/) {
+ $l = length($x) - 1 + 5000;
+ }
+ $c0[$l] += 1 / @s;
+ }
+ }
+ for (my $i = 0; $i < 10000; ++$i) {
+ next if ($c0[$i] == 0);
+ $c1[0] += $c0[$i];
+ $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
+ printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
+ }
+ printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
+}
+
+sub ucscsnp2vcf {
+ die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
+ print "##fileformat=VCFv4.0\n";
+ print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
+ while (<>) {
+ my @t = split("\t");
+ my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
+ my $pos = $t[2] + 1;
+ my @alt;
+ push(@alt, $t[7]);
+ if ($t[6] eq '-') {
+ $t[9] = reverse($t[9]);
+ $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
+ }
+ my @a = split("/", $t[9]);
+ for (@a) {
+ push(@alt, $_) if ($_ ne $alt[0]);
+ }
+ if ($indel) {
+ --$pos;
+ for (0 .. $#alt) {
+ $alt[$_] =~ tr/-//d;
+ $alt[$_] = "N$alt[$_]";
+ }
+ }
+ my $ref = shift(@alt);
+ my $af = $t[13] > 0? ";AF=$t[13]" : '';
+ my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
+ my $info = "molType=$t[10];class=$t[11]$valid$af";
+ print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
+ }
+}
+
+sub hapmap2vcf {
+ die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
+ my $fn = shift(@ARGV);
+ # parse UCSC SNP
+ warn("Parsing UCSC SNPs...\n");
+ my ($fh, %map);
+ open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
+ while (<$fh>) {
+ my @t = split;
+ next if ($t[3] - $t[2] != 1); # not SNP
+ @{$map{$t[4]}} = @t[1,3,7];
+ }
+ close($fh);
+ # write VCF
+ warn("Writing VCF...\n");
+ print "##fileformat=VCFv4.0\n";
+ while (<>) {
+ my @t = split;
+ if ($t[0] eq 'rs#') { # the first line
+ print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
+ } else {
+ next unless ($map{$t[0]});
+ next if (length($t[1]) != 3); # skip non-SNPs
+ my $a = \@{$map{$t[0]}};
+ my $ref = $a->[2];
+ my @u = split('/', $t[1]);
+ if ($u[1] eq $ref) {
+ $u[1] = $u[0]; $u[0] = $ref;
+ } elsif ($u[0] ne $ref) { next; }
+ my $alt = $u[1];
+ my %w;
+ $w{$u[0]} = 0; $w{$u[1]} = 1;
+ my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
+ my $is_tri = 0;
+ for (@t[11..$#t]) {
+ if ($_ eq 'NN') {
+ push(@s, './.');
+ } else {
+ my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
+ if (!defined($a[0]) || !defined($a[1])) {
+ $is_tri = 1;
+ last;
+ }
+ push(@s, "$a[0]/$a[1]");
+ }
+ }
+ next if ($is_tri);
+ print join("\t", @s), "\n";
+ }
+ }
+}
+
+sub vcf2fq {
+ my %opts = (d=>3, D=>100000, Q=>10, l=>5);
+ getopts('d:D:Q:l:', \%opts);
+ die(qq/
+Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
+
+Options: -d INT minimum depth [$opts{d}]
+ -D INT maximum depth [$opts{D}]
+ -Q INT min RMS mapQ [$opts{Q}]
+ -l INT INDEL filtering window [$opts{l}]
+\n/) if (@ARGV == 0 && -t STDIN);
+
+ my ($last_chr, $seq, $qual, $last_pos, @gaps);
+ my $_Q = $opts{Q};
+ my $_d = $opts{d};
+ my $_D = $opts{D};
+
+ my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
+ GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
+
+ $last_chr = '';
+ while (<>) {
+ next if (/^#/);
+ my @t = split;
+ if ($last_chr ne $t[0]) {
+ &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
+ ($last_chr, $last_pos) = ($t[0], 0);
+ $seq = $qual = '';
+ @gaps = ();
+ }
+ die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
+ if ($t[1] - $last_pos > 1) {
+ $seq .= 'n' x ($t[1] - $last_pos - 1);
+ $qual .= '!' x ($t[1] - $last_pos - 1);
+ }
+ if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
+ my ($ref, $alt) = ($t[3], $1);
+ my ($b, $q);
+ $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
+ if ($q < 0) {
+ $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
+ $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
+ $q = -$q;
+ } else {
+ $b = $het{"$ref$alt"};
+ $b ||= 'N';
+ }
+ $b = lc($b);
+ $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
+ $q = int($q + 33 + .499);
+ $q = chr($q <= 126? $q : 126);
+ $seq .= $b;
+ $qual .= $q;
+ } elsif ($t[4] ne '.') { # an INDEL
+ push(@gaps, [$t[1], length($t[3])]);
+ }
+ $last_pos = $t[1];
+ }
+ &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
+}
+
+sub v2q_post_process {
+ my ($chr, $seq, $qual, $gaps, $l) = @_;
+ for my $g (@$gaps) {
+ my $beg = $g->[0] > $l? $g->[0] - $l : 0;
+ my $end = $g->[0] + $g->[1] + $l;
+ $end = length($$seq) if ($end > length($$seq));
+ substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
+ }
+ print "\@$chr\n"; &v2q_print_str($seq);
+ print "+\n"; &v2q_print_str($qual);
+}
+
+sub v2q_print_str {
+ my ($s) = @_;
+ my $l = length($$s);
+ for (my $i = 0; $i < $l; $i += 60) {
+ print substr($$s, $i, 60), "\n";