3 # VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
12 my $opts = parse_params();
13 do_pileup_to_vcf($opts);
22 if ( scalar @msg ) { croak(@msg); }
24 "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n",
26 " -r, --refseq <file.fa> The reference sequence, required when indels are present.\n",
27 " -s, --snps-only Ignore indels.\n",
28 " -h, -?, --help This help message.\n",
37 $opts{fh_in} = *STDIN;
38 $opts{fh_out} = *STDOUT;
40 while (my $arg=shift(@ARGV))
42 if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; }
43 if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; }
44 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
46 error("Unknown parameter \"$arg\". Run -h for help.\n");
62 if ( !exists($iupac{$base}) )
64 if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); }
65 if ( $ref eq $base ) { return ('.','0/0'); }
68 my $gt = $iupac{$base};
69 if ( $$gt[0] eq $ref ) { return ($$gt[1],'0/1'); }
70 elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); }
71 return ("$$gt[0],$$gt[1]",'1/2');
83 elsif ( $cons=~/^\+/ ) { return "I$'"; }
84 elsif ( $cons eq '*' ) { return undef; }
85 error("FIXME: could not parse [$cons]\n");
89 # An example of the pileup format:
90 # 1 3000011 C C 32 0 98 1 ^~, A
91 # 1 3002155 * +T/+T 53 119 52 5 +T * 4 1 0
92 # 1 3003094 * -TT/-TT 31 164 60 11 -TT * 5 6 0
93 # 1 3073986 * */-AAAAAAAAAAAAAA 3 3 45 9 * -AAAAAAAAAAAAAA 7 2 0
99 my $fh_in = $$opts{fh_in};
100 my $fh_out = $$opts{fh_out};
101 my ($prev_chr,$prev_pos,$prev_ref);
103 my $ignore_indels = $$opts{snps_only} ? 1 : 0;
106 qq[##fileformat=VCFv3.3\n],
107 qq[##INFO=DP,1,Integer,"Total Depth"\n],
108 qq[##FORMAT=GT,1,String,"Genotype"\n],
109 qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n],
110 qq[##FORMAT=DP,1,Integer,"Read Depth"\n],
111 qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tdata\n]
114 while (my $line=<$fh_in>)
117 my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,@items) = split(/\t/,$line);
122 # An indel is involved.
123 if ( $ignore_indels ) { next; }
125 if ($chr ne $prev_chr || $pos ne $prev_pos)
127 if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); }
128 if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); }
129 $ref = $refseq->get_base($chr,$pos);
131 else { $ref = $prev_ref; }
133 # One of the alleles can be a reference and it can come in arbitrary order
134 my ($al1,$al2) = split(m{/},$cons);
135 my $alt1 = parse_indel($al1);
136 my $alt2 = parse_indel($al2);
137 if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); }
138 if ( $alt1 && $alt2 && $alt1 eq $alt2 ) { $alt2=''; }
158 ($alt,$gt) = iupac_to_gtype($ref,$cons);
161 print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";
170 #------------- Fasta --------------------
172 # Uses samtools to get a requested base from a fasta file. For efficiency, preloads
173 # a chunk to memory. The size of the cached sequence can be controlled by the 'size'
184 my ($class,@args) = @_;
186 bless $self, ref($class) || $class;
187 if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); }
189 $$self{from} = undef;
191 if ( !$$self{size} ) { $$self{size}=10_000_000; }
192 bless $self, ref($class) || $class;
198 my ($self,$chr,$pos) = @_;
199 my $to = $pos + $$self{size};
200 my $cmd = "samtools faidx $$self{file} $chr:$pos-$to";
202 if ( $? ) { $self->throw("$cmd: $!"); }
203 my $line = shift(@out);
204 if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); }
209 while ($line=shift(@out))
214 $$self{chunk} = $chunk;
220 my ($self,$chr,$pos) = @_;
221 if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} )
223 $self->read_chunk($chr,$pos);
225 my $idx = $pos - $$self{from};
226 return substr($$self{chunk},$idx,1);
231 my ($self,@msg) = @_;