2 # vcf_add_snp_info adds additional snp information to vcf files, and
3 # is released under the terms of the GPL version 2, or any later
4 # version, at your option. See the file README and COPYING for more
6 # Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
19 vcf_add_snp_info - Add additional snp information to vcf files
26 --debug, -d debugging level (Default 0)
27 --help, -h display this help
28 --man, -m display manual
36 Debug verbosity. (Default 0)
40 Display brief usage information.
50 vcf_fill_rsid foo.vcf > bar.vcf
58 use Scalar::Util qw(looks_like_number);
61 my %options = (debug => 0,
73 'debug|d+','help|h|?','man|m');
75 pod2usage() if $options{help};
76 pod2usage({verbose=>2}) if $options{man};
78 $DEBUG = $options{debug};
82 # push @USAGE_ERRORS,"You must pass something";
85 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
88 my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
89 die "Unable to connect to database: ".$DBI::errstr;
91 my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
92 SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?;
95 my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr;
96 SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=?
99 my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr;
100 SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
103 my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr;
104 SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
107 my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr;
108 SELECT * FROM mrna_cds_table WHERE cds_gi=?;
111 my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr;
112 SELECT * FROM snphistory WHERE snp_id=?;
116 my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr;
117 SELECT ssl.snp_id AS snp_id,
118 si.subsnp_id AS subsnp_id,
119 -- ssl.substrand_reversed_flag,
121 si.submitted_ind_id AS submitted_ind_id,
122 ga.chr_num AS chr_num,
123 -- si.submitted_strand_code,
125 ga.rev_flag AS rev_flag,
127 ra.allele AS rev_allele,
128 p.handle AS pop_handle,
129 p.loc_pop_id AS loc_pop_id
130 FROM snpsubsnplink ssl
131 JOIN subind si on ssl.subsnp_id=si.subsnp_id
132 JOIN gtyallele ga on si.gty_id=ga.gty_id
133 JOIN allele a on ga.fwd_allele_id = a.allele_id
134 JOIN allele ra on a.rev_allele_id = ra.allele_id
135 JOIN submittedindividual sind on si.submitted_ind_id=sind.submitted_ind_id
136 JOIN population p on sind.pop_id=p.pop_id
137 WHERE ssl.snp_id=? AND
138 si.chr=? AND ga.rev_flag = '0' AND
142 my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr;
143 SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
146 my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
147 SELECT * FROM ga_snp WHERE snp_id=?;
154 my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
155 $prot_table->{$codon} = $aa;
158 my @headers = qw(chr pos id ref alt qual filter info format);
160 print $_ and next if /^#/;
162 my @row = split /\t/, $_;
164 @row{@headers} = @row[0..$#headers];
165 # this is taken directly from vcf_fill_rsid; we want it for the time being anyway
166 if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) {
167 $row{id} = find_rsid(chr => fixup_chr($row{chr}),
169 pos => $row{pos} - 1,
174 if ($row{id} =~ /(rs|ss)(\d+)/) {
177 $id = find_real_snp_id(id => $id,
179 sth => $real_snp_sth,
181 if (not defined $id and
182 is_retired_snp(id => $id,
184 sth => $sth_snp_history,
186 if (defined $row{chr} and defined $row{pos}) {
191 $row{id} = "$type$id";
194 my $mrna_information = mrna_information(dbh => $dbh,
196 # these are all 1 indexed
198 chr => fixup_chr($row{chr}),
201 cds_sth => $sth_cds_ss,
202 mrna_cds_sth => $sth_mrna_cds,
204 if (keys %{$mrna_information}) {
205 # strip out existing information in $row{info}
206 $row{info} =~ s{(?:GENE|(?:CDS|PROT|)POS|NUM|VAR|ACC|NONSYN|(?:ALT|REF)(?:AA|CODON))\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
208 push @extra_info,"GENE=".$mrna_information->{gene};
209 push @extra_info,"NONSYN=".1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn};
210 push @extra_info,"REFAA=".$mrna_information->{orig_aa} if defined $mrna_information->{orig_aa};
211 push @extra_info,"ALTAA=".$mrna_information->{new_aa} if defined $mrna_information->{new_aa};
212 push @extra_info,"REFCODON=".$mrna_information->{orig_codon} if defined $mrna_information->{orig_codon};
213 push @extra_info,"ALTCODON=".$mrna_information->{new_codon} if defined $mrna_information->{new_codon};
214 push @extra_info,"CDSPOS=".$mrna_information->{cds_pos} if defined $mrna_information->{cds_pos};
215 push @extra_info,"PROTPOS=".$mrna_information->{prot_pos} if defined $mrna_information->{prot_pos};
216 push @extra_info,"POS=".($mrna_information->{exon}?'exon':'intron');
217 push @extra_info,"NUM=".$mrna_information->{number};
218 push @extra_info,"VAR=".$mrna_information->{variant} if defined $mrna_information->{variant};
219 push @extra_info,"ACC=".$mrna_information->{accession};
220 push @extra_info,"ACCVER=".$mrna_information->{accession_ver};
221 push @extra_info,"PHASE=".$mrna_information->{phase} if defined $mrna_information->{phase};
222 $row{info} = join(';',@extra_info,$row{info});
224 my $allele_freq_information = allele_freq_information(dbh => $dbh,
225 sth => $sth_allele_freq,
227 # these are all 1 indexed
229 chr => fixup_chr($row{chr}),
233 if (keys %{$allele_freq_information}) {
234 # strip out existing information in $row{info}
235 $row{info} =~ s{(?:REFFREQ|ALTFREQ|REFALTN)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
237 if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) {
238 push @extra_info,"REFFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref}));
239 push @extra_info,"ALTFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt}));
240 push @extra_info,"REFALTN=".$allele_freq_information->{alleles}{total};
242 $row{info} = join(';',@extra_info,$row{info});
244 my $ga_study_information = ga_study_information(dbh => $dbh,
248 if (defined $ga_study_information) {
249 # strip out existing information in $row{info}
250 $row{info} =~ s{(?:GASTUDY|GASTUDYSIG)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
252 push @extra_info,"GASTUDY=".$ga_study_information->{studies};
253 push @extra_info,"GASTUDYSIG=".($ga_study_information->{significant}?'TRUE':'FALSE');
254 $row{info} = join(';',@extra_info,$row{info});
256 print join("\t",@row{@headers},$#row > $#headers?@row[@headers..$#row]:())."\n";
257 if ($options{verbose} > 0 and $. % 100 == 0) {
258 print STDERR "At line $.\n";
262 sub ga_study_information {
265 return undef unless $param{id} =~ /^\d+$/;
266 my $rv = $param{sth}->execute($param{id}) or
267 die "Unable to execute statement properly: ".$param{dbh}->errstr;
268 my $results = $param{sth}->fetchall_arrayref({});
269 if (not @{$results}) {
272 my $any_significant=0;
274 for my $result (sort {
275 if (looks_like_number($a->{pvalue})) {
276 if (looks_like_number($b->{pvalue})) {
277 $a->{pvalue} <=> $b->{pvalue};
281 } elsif (looks_like_number($b->{pvalue})) {
286 if ($result->{significant}) {
289 my $value = $result->{study_name}.':'.$result->{subpart_name};
290 for my $t (qw(pvalue qvalue fdr)) {
291 next unless exists $result->{$t} and defined $result->{$t} and length $result->{$t};
292 my ($fl) = $t =~ m/^(.)/;
293 $value .= ":$fl:".sprintf('%.3g',$result->{$t});
295 push @values, $value;
297 return {significant => $any_significant,
298 studies => join(',',@values),
303 sub mrna_information{
306 my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or
307 die "Unable to execute statement properly: ".$param{dbh}->errstr;
309 my $results = $param{sth}->fetchall_arrayref({});
310 if ($param{sth}->err) {
311 print STDERR $param{sth}->errstr;
315 for my $result (@{$results}) {
316 if (not defined $chosen_result) {
317 $chosen_result = $result;
320 if (not defined $chosen_result->{variant} or
321 (defined $result->{variant} and
322 $chosen_result->{variant} > $result->{variant})) {
323 $chosen_result = $result;
327 if (defined $chosen_result and $chosen_result->{exon}) {
328 # try to figure out the cds if there is mrna information and
330 my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or
331 die "Unable to execute statement properly: ".$param{dbh}->errstr;
332 my $cds_results = $param{cds_sth}->fetchall_arrayref({});
333 if ($param{cds_sth}->err) {
334 print STDERR $param{cds_sth}->errstr;
337 print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG;
339 for my $cds_result (@{$cds_results}) {
340 if (not defined $chosen_cds) {
341 $chosen_cds = $cds_result;
344 if (not defined $cds_result->{variant} or
345 (defined $chosen_cds->{variant} and
346 $chosen_cds->{variant} ge $cds_result->{variant})) {
347 $chosen_cds = $cds_result;
351 if (defined $chosen_cds and keys %{$chosen_cds}) {
352 my $comp_offset = $chosen_cds->{complement}?-1:1;
353 $chosen_result->{in_cds} = 1;
354 $chosen_result->{cds_pos} =
355 ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) +
356 $chosen_cds->{position};
357 $chosen_result->{gene} = $chosen_cds->{gene};
358 $chosen_result->{pos} = $chosen_cds->{pos};
359 $chosen_result->{number} = $chosen_cds->{number};
360 $chosen_result->{variant} = $chosen_cds->{variant};
361 $chosen_result->{acc} = $chosen_cds->{accession};
362 $chosen_result->{accver} = $chosen_cds->{accession_ver};
363 $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3);
364 $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3;
365 print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG;
366 my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or
367 die "Unable to execute statement properly: ".$param{dbh}->errstr;
368 my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref();
369 if (defined $mrna_cds_result and keys %{$mrna_cds_result}) {
370 print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param],
371 [qw(mrna_cds_result chosen_result *param)]) if $DEBUG;
372 my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3) ,3;
373 my $orig_aa = convert_to_prot_seq($codon);
374 my $modified_codon = $codon;
375 substr($modified_codon,$chosen_result->{cds_pos} % 3,1)=
376 ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt});
377 my $new_aa = convert_to_prot_seq($modified_codon);
378 print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3),
379 codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG;
382 print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g;
384 $chosen_result->{orig_aa} = $orig_aa;
385 $chosen_result->{new_aa} = $new_aa;
386 $chosen_result->{new_codon} = $modified_codon;
387 $chosen_result->{orig_codon} = $codon;
388 $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0;
393 return $chosen_result // {};
396 sub find_real_snp_id {
400 my $rv = $param{sth}->execute($param{id},$param{id}) //
401 die "Unable to execute statement properly: ".$param{dbh}->errstr;
402 my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]);
403 if ($param{sth}->err) {
404 print STDERR $param{sth}->errstr;
409 return $real_id // $param{id};
416 my $rv = $param{sth}->execute($param{id}) //
417 die "Unable to execute statement properly: ".$param{dbh}->errstr;
418 if ($param{sth}->rows >= 1) {
428 $chr =~ s/^NC_00000?//;
430 if (looks_like_number($chr)) {
441 sub allele_freq_information{
444 return {} if $param{id} !~ /^\d+$/;
446 my $rv = $param{sth}->execute($param{id},$param{chr}) or
447 die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr};
449 my $results = $param{sth}->fetchall_arrayref({});
450 if ($param{sth}->err) {
451 print STDERR $param{sth}->errstr;
458 for my $result (@{$results}) {
459 next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}};
460 $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle};
461 $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id};
462 $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele};
463 if (defined $result->{loc_pop_id}) {
464 $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++;
465 $pop_alleles{$result->{loc_pop_id}}{total}++;
467 $alleles{$result->{allele}}++;
469 $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1;
473 for my $indiv (values %indivs) {
474 if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) {
475 my $genotype = join('/',
476 sort($indiv->{chr_1},
478 $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++;
479 $pop_genotypes{$indiv->{loc_pop_id}}{total}++;
480 $genotypes{$genotype}++;
484 return {pop_alleles => \%pop_alleles,
485 pop_genotypes => \%pop_genotypes,
486 genotypes => \%genotypes,
487 alleles => \%alleles,
492 my ($freq_table,$allele) = @_;
493 if (defined $allele) {
494 return ((exists $freq_table->{$allele} and defined $freq_table->{$allele}) ?
495 $freq_table->{$allele}/$freq_table->{total} : 0);
497 my @alleles = grep {$_ ne 'total'} keys %{$freq_table};
498 return join(',',map {$_.':'.(defined $freq_table->{$_} ?
499 $freq_table->{$_}/$freq_table->{total} : 0)} @alleles);
506 my $rv = $param{sth}->execute($param{chr},$param{pos}) //
507 die "Unable to execute statement properly: ".$param{dbh}->errstr;
508 my @snp_ids = grep {$_ =~ /^\d+$/}
509 map {ref $_ eq 'ARRAY'? @{$_}:$_}
510 map {ref $_ eq 'ARRAY'? @{$_}:$_}
511 $sth->fetchall_arrayref([0]);
513 print STDERR $sth->errstr;
519 return join(',',@snp_ids);
524 return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq);
527 sub convert_to_prot_seq{
531 for my $pos (0..(length($mrna) / 3 - 1)) {
532 my $codon = substr($mrna,$pos*3,3);
533 if (not exists $prot_table->{$codon}) {
534 # this isn't quite right, as we should really be handling
535 # ambiguities which do not result in different amino
536 # acids, but for the time being, do it this way.
540 $prot .= $prot_table->{$codon};
561 TTT F Phe Phenylalanine
562 TTC F Phe Phenylalanine
597 GAA E Glu Glutamic acid
598 GAG E Glu Glutamic acid
599 GAT D Asp Aspartic acid
600 GAC D Asp Aspartic acid