#! /usr/bin/perl # vcf_add_snp_info adds additional snp information to vcf files, and # is released under the terms of the GPL version 2, or any later # version, at your option. See the file README and COPYING for more # information. # Copyright 2011 by Don Armstrong . use warnings; use strict; BEGIN {$|=1}; use Getopt::Long; use Pod::Usage; =head1 NAME vcf_add_snp_info - Add additional snp information to vcf files =head1 SYNOPSIS [options] Options: --debug, -d debugging level (Default 0) --help, -h display this help --man, -m display manual =head1 OPTIONS =over =item B<--debug, -d> Debug verbosity. (Default 0) =item B<--help, -h> Display brief usage information. =item B<--man, -m> Display this manual. =back =head1 EXAMPLES vcf_fill_rsid foo.vcf > bar.vcf =cut use vars qw($DEBUG); use DBI; use Data::Dumper; use Scalar::Util qw(looks_like_number); my %options = (debug => 0, help => 0, man => 0, verbose => 0, build => '135', refseq => '37_3', ); GetOptions(\%options, 'build', 'refseq', 'verbose|v+', 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; my @USAGE_ERRORS; # if (1) { # push @USAGE_ERRORS,"You must pass something"; # } pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or die "Unable to connect to database: ".$DBI::errstr; my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr; SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?; END my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr; SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=? END my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr; SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; END my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr; SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; END my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr; SELECT * FROM mrna_cds_table WHERE cds_gi=?; END my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr; SELECT * FROM snphistory WHERE snp_id=?; END my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr; SELECT ssl.snp_id AS snp_id, si.subsnp_id AS subsnp_id, -- ssl.substrand_reversed_flag, si.chr AS chr, si.submitted_ind_id AS submitted_ind_id, ga.chr_num AS chr_num, -- si.submitted_strand_code, -- si.allele_flag, ga.rev_flag AS rev_flag, a.allele AS allele, ra.allele AS rev_allele, p.handle AS pop_handle, p.loc_pop_id AS loc_pop_id FROM snpsubsnplink ssl JOIN subind si on ssl.subsnp_id=si.subsnp_id JOIN gtyallele ga on si.gty_id=ga.gty_id JOIN allele a on ga.fwd_allele_id = a.allele_id JOIN allele ra on a.rev_allele_id = ra.allele_id JOIN submittedindividual sind on si.submitted_ind_id=sind.submitted_ind_id JOIN population p on sind.pop_id=p.pop_id WHERE ssl.snp_id=? AND si.chr=? AND ga.rev_flag = '0' AND a.allele <> 'N'; END my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr; SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; END my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr; SELECT * FROM ga_snp WHERE snp_id=?; END my $prot_table; while () { chomp; my ($codon,$aa,$aa_med,$aa_long) = split /\t/; $prot_table->{$codon} = $aa; } my @headers = qw(chr pos id ref alt qual filter info format); while (<>) { print $_ and next if /^#/; chomp; my @row = split /\t/, $_; my %row; @row{@headers} = @row[0..$#headers]; # this is taken directly from vcf_fill_rsid; we want it for the time being anyway if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) { $row{id} = find_rsid(chr => fixup_chr($row{chr}), # snps are 0 indexed pos => $row{pos} - 1, dbh => $dbh, sth => $sth, ); } if ($row{id} =~ /(rs|ss)(\d+)/) { my $type = $1; my $id = $2; $id = find_real_snp_id(id => $id, dbh => $dbh, sth => $real_snp_sth, ); if (not defined $id and is_retired_snp(id => $id, dbh => $dbh, sth => $sth_snp_history, )) { if (defined $row{chr} and defined $row{pos}) { $row{id} = undef; } } else { $row{id} = "$type$id"; } } my $mrna_information = mrna_information(dbh => $dbh, sth => $sth_mrna_ss, # these are all 1 indexed pos => $row{pos}, chr => fixup_chr($row{chr}), ref => $row{ref}, alt => $row{alt}, cds_sth => $sth_cds_ss, mrna_cds_sth => $sth_mrna_cds, ); if (keys %{$mrna_information}) { # strip out existing information in $row{info} $row{info} =~ s{(?:GENE|(?:CDS|PROT|)POS|NUM|VAR|ACC|NONSYN|(?:ALT|REF)(?:AA|CODON))\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g; my @extra_info; push @extra_info,"GENE=".$mrna_information->{gene}; push @extra_info,"NONSYN=".1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn}; push @extra_info,"REFAA=".$mrna_information->{orig_aa} if defined $mrna_information->{orig_aa}; push @extra_info,"ALTAA=".$mrna_information->{new_aa} if defined $mrna_information->{new_aa}; push @extra_info,"REFCODON=".$mrna_information->{orig_codon} if defined $mrna_information->{orig_codon}; push @extra_info,"ALTCODON=".$mrna_information->{new_codon} if defined $mrna_information->{new_codon}; push @extra_info,"CDSPOS=".$mrna_information->{cds_pos} if defined $mrna_information->{cds_pos}; push @extra_info,"PROTPOS=".$mrna_information->{prot_pos} if defined $mrna_information->{prot_pos}; push @extra_info,"POS=".($mrna_information->{exon}?'exon':'intron'); push @extra_info,"NUM=".$mrna_information->{number}; push @extra_info,"VAR=".$mrna_information->{variant} if defined $mrna_information->{variant}; push @extra_info,"ACC=".$mrna_information->{accession}; push @extra_info,"ACCVER=".$mrna_information->{accession_ver}; push @extra_info,"PHASE=".$mrna_information->{phase} if defined $mrna_information->{phase}; $row{info} = join(';',@extra_info,$row{info}); } my $allele_freq_information = allele_freq_information(dbh => $dbh, sth => $sth_allele_freq, id => $row{id}, # these are all 1 indexed pos => $row{pos}, chr => fixup_chr($row{chr}), ref => $row{ref}, alt => $row{alt}, ); if (keys %{$allele_freq_information}) { # strip out existing information in $row{info} $row{info} =~ s{(?:REFFREQ|ALTFREQ|REFALTN)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g; my @extra_info; if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) { push @extra_info,"REFFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref})); push @extra_info,"ALTFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt})); push @extra_info,"REFALTN=".$allele_freq_information->{alleles}{total}; } $row{info} = join(';',@extra_info,$row{info}); } my $ga_study_information = ga_study_information(dbh => $dbh, sth => $sth_ga_info, id => $row{id}, ); if (defined $ga_study_information) { # strip out existing information in $row{info} $row{info} =~ s{(?:GASTUDY|GASTUDYSIG)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g; my @extra_info; push @extra_info,"GASTUDY=".$ga_study_information->{studies}; push @extra_info,"GASTUDYSIG=".($ga_study_information->{significant}?'TRUE':'FALSE'); $row{info} = join(';',@extra_info,$row{info}); } print join("\t",@row{@headers},$#row > $#headers?@row[@headers..$#row]:())."\n"; if ($options{verbose} > 0 and $. % 100 == 0) { print STDERR "At line $.\n"; } } sub ga_study_information { my %param = @_; return undef unless $param{id} =~ /^\d+$/; my $rv = $param{sth}->execute($param{id}) or die "Unable to execute statement properly: ".$param{dbh}->errstr; my $results = $param{sth}->fetchall_arrayref({}); if (not @{$results}) { return undef; } my $any_significant=0; my @values; for my $result (sort { if (looks_like_number($a->{pvalue})) { if (looks_like_number($b->{pvalue})) { $a->{pvalue} <=> $b->{pvalue}; } else { -1; } } elsif (looks_like_number($b->{pvalue})) { 1; } else { 0; }} @{$results}) { if ($result->{significant}) { $any_significant=1; } my $value = $result->{study_name}.':'.$result->{subpart_name}; for my $t (qw(pvalue qvalue fdr)) { next unless exists $result->{$t} and defined $result->{$t} and length $result->{$t}; my ($fl) = $t =~ m/^(.)/; $value .= ":$fl:".sprintf('%.3g',$result->{$t}); } push @values, $value; } return {significant => $any_significant, studies => join(',',@values), }; } sub mrna_information{ my %param = @_; my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or die "Unable to execute statement properly: ".$param{dbh}->errstr; my $results = $param{sth}->fetchall_arrayref({}); if ($param{sth}->err) { print STDERR $param{sth}->errstr; return {}; } my $chosen_result; for my $result (@{$results}) { if (not defined $chosen_result) { $chosen_result = $result; next; } if (not defined $chosen_result->{variant} or (defined $result->{variant} and $chosen_result->{variant} > $result->{variant})) { $chosen_result = $result; next; } } if (defined $chosen_result and $chosen_result->{exon}) { # try to figure out the cds if there is mrna information and # we're in an exon my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or die "Unable to execute statement properly: ".$param{dbh}->errstr; my $cds_results = $param{cds_sth}->fetchall_arrayref({}); if ($param{cds_sth}->err) { print STDERR $param{cds_sth}->errstr; } else { print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG; my $chosen_cds; for my $cds_result (@{$cds_results}) { if (not defined $chosen_cds) { $chosen_cds = $cds_result; next; } if (not defined $cds_result->{variant} or (defined $chosen_cds->{variant} and $chosen_cds->{variant} ge $cds_result->{variant})) { $chosen_cds = $cds_result; next; } } if (defined $chosen_cds and keys %{$chosen_cds}) { my $comp_offset = $chosen_cds->{complement}?-1:1; $chosen_result->{in_cds} = 1; $chosen_result->{cds_pos} = ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) + $chosen_cds->{position}; $chosen_result->{gene} = $chosen_cds->{gene}; $chosen_result->{pos} = $chosen_cds->{pos}; $chosen_result->{number} = $chosen_cds->{number}; $chosen_result->{variant} = $chosen_cds->{variant}; $chosen_result->{acc} = $chosen_cds->{accession}; $chosen_result->{accver} = $chosen_cds->{accession_ver}; $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3); $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3; print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG; my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or die "Unable to execute statement properly: ".$param{dbh}->errstr; my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref(); if (defined $mrna_cds_result and keys %{$mrna_cds_result}) { print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param], [qw(mrna_cds_result chosen_result *param)]) if $DEBUG; my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3) ,3; my $orig_aa = convert_to_prot_seq($codon); my $modified_codon = $codon; substr($modified_codon,$chosen_result->{cds_pos} % 3,1)= ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt}); my $new_aa = convert_to_prot_seq($modified_codon); print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3), codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG; if ($DEBUG > 1) { my $a = -1; print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g; } $chosen_result->{orig_aa} = $orig_aa; $chosen_result->{new_aa} = $new_aa; $chosen_result->{new_codon} = $modified_codon; $chosen_result->{orig_codon} = $codon; $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0; } } } } return $chosen_result // {}; } sub find_real_snp_id { my %param = @_; my %info; my $rv = $param{sth}->execute($param{id},$param{id}) // die "Unable to execute statement properly: ".$param{dbh}->errstr; my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]); if ($param{sth}->err) { print STDERR $param{sth}->errstr; $param{sth}->finish; return $param{id}; } $param{sth}->finish; return $real_id // $param{id}; } sub is_retired_snp { my %param = @_; my %info; my $rv = $param{sth}->execute($param{id}) // die "Unable to execute statement properly: ".$param{dbh}->errstr; if ($param{sth}->rows >= 1) { return 1; } return 0; } sub fixup_chr { my ($chr) = @_; $chr =~ s/^NC_00000?//; $chr =~ s/^chr//; if (looks_like_number($chr)) { if ($chr == 23) { $chr = "X" } elsif ($chr == 24) { $chr = "Y" } } return ($chr); } sub allele_freq_information{ my %param = @_; return {} if $param{id} !~ /^\d+$/; my $rv = $param{sth}->execute($param{id},$param{chr}) or die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr}; my $results = $param{sth}->fetchall_arrayref({}); if ($param{sth}->err) { print STDERR $param{sth}->errstr; return {}; } my %pop_alleles; my %alleles; my %seen_indiv; my %indivs; for my $result (@{$results}) { next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}}; $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle}; $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id}; $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele}; if (defined $result->{loc_pop_id}) { $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++; $pop_alleles{$result->{loc_pop_id}}{total}++; } $alleles{$result->{allele}}++; $alleles{total}++; $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1; } my %genotypes; my %pop_genotypes; for my $indiv (values %indivs) { if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) { my $genotype = join('/', sort($indiv->{chr_1}, $indiv->{chr_2})); $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++; $pop_genotypes{$indiv->{loc_pop_id}}{total}++; $genotypes{$genotype}++; $genotypes{total}++; } } return {pop_alleles => \%pop_alleles, pop_genotypes => \%pop_genotypes, genotypes => \%genotypes, alleles => \%alleles, }; } sub allele_freq{ my ($freq_table,$allele) = @_; if (defined $allele) { return ((exists $freq_table->{$allele} and defined $freq_table->{$allele}) ? $freq_table->{$allele}/$freq_table->{total} : 0); } my @alleles = grep {$_ ne 'total'} keys %{$freq_table}; return join(',',map {$_.':'.(defined $freq_table->{$_} ? $freq_table->{$_}/$freq_table->{total} : 0)} @alleles); } sub find_rsid { my %param = @_; my $rv = $param{sth}->execute($param{chr},$param{pos}) // die "Unable to execute statement properly: ".$param{dbh}->errstr; my @snp_ids = grep {$_ =~ /^\d+$/} map {ref $_ eq 'ARRAY'? @{$_}:$_} map {ref $_ eq 'ARRAY'? @{$_}:$_} $sth->fetchall_arrayref([0]); if ($sth->err) { print STDERR $sth->errstr; return '?'; } if (not @snp_ids) { return '.'; } return join(',',@snp_ids); } sub dna_complement{ my ($seq) = @_; return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq); } sub convert_to_prot_seq{ my ($mrna) = @_; $mrna =~ s/U/T/g; my $prot = ''; for my $pos (0..(length($mrna) / 3 - 1)) { my $codon = substr($mrna,$pos*3,3); if (not exists $prot_table->{$codon}) { # this isn't quite right, as we should really be handling # ambiguities which do not result in different amino # acids, but for the time being, do it this way. $prot .= 'N'; } else { $prot .= $prot_table->{$codon}; } } return $prot; } __DATA__ ATT I Ile Isoleucine ATC I Ile Isoleucine ATA I Ile Isoleucine CTT L Leu Leucine CTC L Leu Leucine CTA L Leu Leucine CTG L Leu Leucine TTA L Leu Leucine TTG L Leu Leucine GTT V Val Valine GTC V Val Valine GTA V Val Valine GTG V Val Valine TTT F Phe Phenylalanine TTC F Phe Phenylalanine ATG M Met Methionine TGT C Cys Cysteine TGC C Cys Cysteine GCT A Ala Alanine GCC A Ala Alanine GCA A Ala Alanine GCG A Ala Alanine GGT G Gly Glycine GGC G Gly Glycine GGA G Gly Glycine GGG G Gly Glycine CCT P Pro Proline CCC P Pro Proline CCA P Pro Proline CCG P Pro Proline ACT T Thr Threonine ACC T Thr Threonine ACA T Thr Threonine ACG T Thr Threonine TCT S Ser Serine TCC S Ser Serine TCA S Ser Serine TCG S Ser Serine AGT S Ser Serine AGC S Ser Serine TAT Y Tyr Tyrosine TAC Y Tyr Tyrosine TGG W Trp Tryptophan CAA Q Gln Glutamine CAG Q Gln Glutamine AAT N Asn Asparagine AAC N Asn Asparagine CAT H His Histidine CAC H His Histidine GAA E Glu Glutamic acid GAG E Glu Glutamic acid GAT D Asp Aspartic acid GAC D Asp Aspartic acid AAA K Lys Lysine AAG K Lys Lysine CGT R Arg Arginine CGC R Arg Arginine CGA R Arg Arginine CGG R Arg Arginine AGA R Arg Arginine AGG R Arg Arginine TAA * * Stop codon TAG * * Stop codon TGA * * Stop codon