#!/usr/bin/perl # snp_info outputs additional snp information is released under the # terms of the GPL version 2, or any later version, at your option. # See the file README and COPYING for more information. # Copyright 2011 by Don Armstrong . use warnings; use strict; BEGIN {$|=1}; use Getopt::Long; use Pod::Usage; =head1 NAME snp_info - output snp information =head1 SYNOPSIS [options] Options: --debug, -d debugging level (Default 0) --help, -h display this help --man, -m display manual =head1 OPTIONS =over =item B<--nearest-gene> Return the name of the nearest gene if the SNP isn't in a gene =item B<--debug, -d> Debug verbosity. (Default 0) =item B<--help, -h> Display brief usage information. =item B<--man, -m> Display this manual. =back =head1 EXAMPLES echo 'rs1782034' | snp_info; =cut use vars qw($DEBUG); use DBI; use Data::Dumper; use Scalar::Util qw(looks_like_number); my %options = (debug => 0, help => 0, man => 0, verbose => 0, fix_pos => 1, nearest_gene => 0, build => '135', refseq => '37_3', ); GetOptions(\%options, 'verbose|v+', 'fix_pos|fix-pos!', 'nearest_gene|nearest-gene!', 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; my @USAGE_ERRORS; # if (1) { # push @USAGE_ERRORS,"You must pass something"; # } pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or die "Unable to connect to database: ".$DBI::errstr; my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr; SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=? END my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr; SELECT * FROM snphistory WHERE snp_id=?; END my $snp_chr_pos_sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr; SELECT scpr.snp_id AS snp_id, scpr.chr AS chr, scpr.pos AS pos, scl.phys_pos_from AS pos2, scpr.orien AS orien, scl.allele AS ref, uv.var_str AS var_str, ruv.var_str AS rev_var_str, ml.locus_id AS uid, ml.locus_symbol AS symbol, gitn.gene_name AS description, scl.aln_quality AS alignment_quality FROM snp s JOIN b$options{build}_snpchrposonref_$options{refseq} scpr ON s.snp_id=scpr.snp_id JOIN b$options{build}_snpcontigloc_$options{refseq} scl ON scpr.snp_id=scl.snp_id JOIN b$options{build}_contiginfo_$options{refseq} ci ON scl.ctg_id = ci.ctg_id LEFT OUTER JOIN b$options{build}_snpcontiglocusid_$options{refseq} ml ON s.snp_id=ml.snp_id LEFT OUTER JOIN geneidtoname gitn ON ml.locus_id=gitn.gene_id JOIN univariation uv ON s.univar_id=uv.univar_id JOIN univariation ruv ON uv.rev_univar_id=ruv.univar_id WHERE (ci.group_term LIKE 'GRCh%' OR ci.group_term='Primary_Assembly') AND s.snp_id=? ORDER BY scl.aln_quality; END my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr; SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?; END my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr; SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; END my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr; SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; END my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr; SELECT * FROM mrna_cds_table WHERE cds_gi=?; END my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr; SELECT ssl.snp_id AS snp_id, si.subsnp_id AS subsnp_id, si.chr AS chr, si.submitted_ind_id AS submitted_ind_id, ga.chr_num AS chr_num, ga.rev_flag AS rev_flag, a.allele AS allele FROM snpsubsnplink ssl JOIN subind si on ssl.subsnp_id=si.subsnp_id JOIN gtyallele ga on si.gty_id=ga.gty_id JOIN allele a on ga.fwd_allele_id = a.allele_id WHERE ssl.snp_id=? AND si.chr=? AND ga.rev_flag = '0' AND a.allele <> 'N'; END my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr; SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; END my $sth_near_gene_start = $dbh->prepare(<<'END') // die "Unable to prepare near gene start statement: ".$dbh->errstr; SELECT cds.start AS start, cds.stop AS stop, cds.gene AS symbol, gitn.gene_name AS description FROM cds LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id WHERE cds.chr = ? AND cds.start > ? ORDER BY cds.start ASC LIMIT 1; END my $sth_near_gene_stop = $dbh->prepare(<<'END') // die "Unable to prepare near gene stop statement: ".$dbh->errstr; SELECT cds.start AS start, cds.stop AS stop, cds.gene AS symbol, gitn.gene_name AS description FROM cds LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id WHERE cds.chr = ? AND cds.stop < ? ORDER BY cds.stop DESC LIMIT 1; END my $prot_table; while () { chomp; my ($codon,$aa,$aa_med,$aa_long) = split /\t/; $prot_table->{$codon} = $aa; } my @headers = qw(id chr pos ref alt); my @row_keys_to_initialize = qw(gene nonsyn aa_ref aa_alt codon_ref codon_alt cdspos protpos loc num var acc phase reffreq altfreq refaltn); print join("\t",@headers,'orig_id',@row_keys_to_initialize)."\n"; while (<>) { print $_ and next if /^#/; chomp; my @row = split /\t/, $_; my %row; @row{@headers} = @row[0..$#headers]; $row{orig_id} = $row{id}; # this is taken directly from vcf_fill_rsid; we want it for the time being anyway if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) { my $new_id = find_rsid(chr => fixup_chr($row{chr}), # snps are 0 indexed pos => $row{pos} - 1, dbh => $dbh, sth => $sth, ); if ($new_id !~ /^(?:[\.\?])$/) { $row{id} = "rs".$new_id; } } if ($row{id} =~ /(rs|ss)(\d+)/) { my $type = $1; my $id = $2; $id = find_real_snp_id(id => $id, dbh => $dbh, sth => $real_snp_sth, ); if (not defined $id and is_retired_snp(id => $id, dbh => $dbh, sth => $sth_snp_history, )) { if (defined $row{chr} and defined $row{pos}) { $row{id} = "$row{chr}-$row{pos}"; } } else { $row{id} = "$type$id"; } } if (defined $row{id} and ($options{fix_pos} or not defined $row{pos} or not defined $row{chr} or not defined $row{ref} or not defined $row{alt})) { my %info = find_chr_pos(id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0], dbh => $dbh, sth => $snp_chr_pos_sth, ); for my $inf (qw(pos chr ref alt)) { $row{$inf} = $info{$inf} if $options{fix_pos} and defined $info{$inf} or not defined $row{$inf}; } # handle partially aligned snps if ($options{fix_pos} and not defined $row{pos}) { $row{pos} = $info{pos2}; } if (not defined $row{pos} or not defined $row{chr}) { if (defined $row{pos}) { print STDERR "$row{id} is ambiguously placed\n"; } else { print STDERR "Unable to find a position/chr for $row{id}\n"; } } } my $mrna_information = mrna_information(dbh => $dbh, sth => $sth_mrna_ss, # these are all 1 indexed pos => $row{pos}, chr => fixup_chr($row{chr}), ref => $row{ref}, alt => $row{alt}, cds_sth => $sth_cds_ss, mrna_cds_sth => $sth_mrna_cds, ); @row{@row_keys_to_initialize} = ('') x @row_keys_to_initialize; if (keys %{$mrna_information}) { # strip out existing information in $row{info} my @extra_info; $row{gene} = $mrna_information->{gene}; $row{nonsyn} = 1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn}; $row{aa_ref} = $mrna_information->{orig_aa} if defined $mrna_information->{orig_aa}; $row{aa_alt} = $mrna_information->{new_aa} if defined $mrna_information->{new_aa}; $row{codon_ref} = $mrna_information->{orig_codon} if defined $mrna_information->{orig_codon}; $row{codon_alt} = $mrna_information->{new_codon} if defined $mrna_information->{new_codon}; $row{cdspos} = $mrna_information->{cds_pos} if defined $mrna_information->{cds_pos}; $row{protpos} = $mrna_information->{prot_pos} if defined $mrna_information->{prot_pos}; $row{loc} = $mrna_information->{exon}?'exon':'intron' if defined $mrna_information->{gene}; $row{num} = $mrna_information->{number}; $row{var} = $mrna_information->{variant} if defined $mrna_information->{variant}; $row{acc} = $mrna_information->{accession}; $row{phase} = $mrna_information->{phase} if defined $mrna_information->{phase}; } if ((not defined $row{gene} or not length $row{gene}) and $options{nearest_gene}) { $row{gene} = find_nearest_gene(dbh => $dbh, start_sth => $sth_near_gene_start, stop_sth => $sth_near_gene_stop, chr => fixup_chr($row{chr}), pos => $row{pos}, ); } my $allele_freq_information = allele_freq_information(dbh => $dbh, sth => $sth_allele_freq, id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0], # these are all 1 indexed pos => $row{pos}, chr => fixup_chr($row{chr}), ref => $row{ref}, alt => $row{alt}, ) if defined $row{id} and $row{id} =~ /^(?:rs|ss)/; if (keys %{$allele_freq_information}) { # strip out existing information in $row{info} if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) { $row{reffreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref})); $row{altfreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt})); $row{refaltn} = $allele_freq_information->{alleles}{total}; } } print join("\t",map {defined $_?$_:'NA'} @row{@headers,'orig_id',@row_keys_to_initialize},$#row > $#headers?@row[@headers..$#row]:())."\n"; if ($options{verbose} > 0 and $. % 100 == 0) { print STDERR "At line $.\n"; } } sub find_real_snp_id { my %param = @_; my %info; my $rv = $param{sth}->execute($param{id},$param{id}) // die "Unable to execute statement properly: ".$param{dbh}->errstr; my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]); if ($param{sth}->err) { print STDERR $param{sth}->errstr; $param{sth}->finish; return $param{id}; } $param{sth}->finish; return $real_id // $param{id}; } sub find_chr_pos { my %param = @_; my %info; my $rv = $param{sth}->execute($param{id}) // die "Unable to execute statement properly: ".$param{dbh}->errstr; my $results = $param{sth}->fetchall_arrayref({}); if ($param{sth}->err) { print STDERR $param{sth}->errstr; $param{sth}->finish; return %info; } for my $result (@{$results}) { %info = %{$result}; } if (keys %info) { if ($info{chr} eq 'NotOn') { undef $info{chr}; } if ($info{orien}) { $info{orien_var_str} = $info{rev_var_str}; } else { $info{orien_var_str} = $info{var_str}; } my @alleles = split qr{/},$info{orien_var_str}; ($info{alt}) = grep {$_ ne $info{ref}} @alleles; # fix $row{pos} $info{pos}+=1; } $param{sth}->finish; return %info; } sub find_nearest_gene { my %param = @_; return undef unless defined $param{chr} and defined $param{pos}; my %possible_genes; for my $st (qw(start stop)) { my $rv = $param{"${st}_sth"}->execute($param{chr},$param{pos}) // die "Unable to execute nearest gene $st statement properly: ".$param{dbh}->errstr; my $results = $param{"${st}_sth"}->fetchall_arrayref({}); if ($param{"${st}_sth"}->err) { print STDERR $param{"${st}_sth"}->errstr; $param{"${st}_sth"}->finish; return ''; } $possible_genes{$st} = $results->[0] } for my $st (qw(start stop)) { # opposite of start/stop (stop/start) my $st_op = $st eq q(start) ? q(stop) : q(start); if (not defined $possible_genes{$st}{start}) { if (defined $possible_genes{$st_op}{start}) { return $possible_genes{$st_op}{symbol} } else { return undef; } } } my $gene; # if the start of the start gene is closer to the position of the # snp than the stop of the stop gene, it's the nearest gene; # otherwise, the stop gene is if ($possible_genes{start}{start} - $param{pos} < $param{pos} - $possible_genes{stop}{stop} ) { return $possible_genes{start}{symbol}; } else { return $possible_genes{stop}{symbol}; } } sub mrna_information{ my %param = @_; my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or die "Unable to execute statement properly: ".$param{dbh}->errstr; my $results = $param{sth}->fetchall_arrayref({}); if ($param{sth}->err) { print STDERR $param{sth}->errstr; return {}; } my $chosen_result; for my $result (@{$results}) { if (not defined $chosen_result) { $chosen_result = $result; next; } if (not defined $chosen_result->{variant} or (defined $result->{variant} and $chosen_result->{variant} > $result->{variant})) { $chosen_result = $result; next; } } if (defined $chosen_result and $chosen_result->{exon}) { # try to figure out the cds if there is mrna information and # we're in an exon my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or die "Unable to execute statement properly: ".$param{dbh}->errstr; my $cds_results = $param{cds_sth}->fetchall_arrayref({}); if ($param{cds_sth}->err) { print STDERR $param{cds_sth}->errstr; } else { print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG; my $chosen_cds; for my $cds_result (@{$cds_results}) { if (not defined $chosen_cds) { $chosen_cds = $cds_result; next; } if (not defined $cds_result->{variant} or (defined $chosen_cds->{variant} and $chosen_cds->{variant} ge $cds_result->{variant})) { $chosen_cds = $cds_result; next; } } if (defined $chosen_cds and keys %{$chosen_cds}) { my $comp_offset = $chosen_cds->{complement}?-1:1; $chosen_result->{in_cds} = 1; $chosen_result->{cds_pos} = ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) + $chosen_cds->{position}; $chosen_result->{gene} = $chosen_cds->{gene}; $chosen_result->{pos} = $chosen_cds->{pos}; $chosen_result->{number} = $chosen_cds->{number}; $chosen_result->{variant} = $chosen_cds->{variant}; $chosen_result->{acc} = $chosen_cds->{accession}; $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3); $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3; print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG; my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or die "Unable to execute statement properly: ".$param{dbh}->errstr; my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref(); if (defined $mrna_cds_result and keys %{$mrna_cds_result}) { print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param], [qw(mrna_cds_result chosen_result *param)]) if $DEBUG; my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3) ,3; my $orig_aa = convert_to_prot_seq($codon); my $modified_codon = $codon; substr($modified_codon,$chosen_result->{cds_pos} % 3,1)= ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt}); my $new_aa = convert_to_prot_seq($modified_codon); print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3), codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG; if ($DEBUG > 1) { my $a = -1; print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g; } $chosen_result->{orig_aa} = $orig_aa; $chosen_result->{new_aa} = $new_aa; $chosen_result->{new_codon} = $modified_codon; $chosen_result->{orig_codon} = $codon; $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0; } } } } return $chosen_result // {}; } sub fixup_chr { my ($chr) = @_; return $chr if not defined $chr; $chr =~ s/^NC_00000?//; $chr =~ s/^chr//; if (looks_like_number($chr)) { if ($chr == 23) { $chr = "X" } elsif ($chr == 24) { $chr = "Y" } } return ($chr); } sub allele_freq_information{ my %param = @_; return {} if $param{id} !~ /^\d+$/; my $rv = $param{sth}->execute($param{id},$param{chr}) or die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr}; my $results = $param{sth}->fetchall_arrayref({}); if ($param{sth}->err) { print STDERR $param{sth}->errstr; return {}; } my %pop_alleles; my %alleles; my %seen_indiv; my %indivs; for my $result (@{$results}) { next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}}; $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele}; if (defined $result->{loc_pop_id}) { $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id}; $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle}; $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++; $pop_alleles{$result->{loc_pop_id}}{total}++; } $alleles{$result->{allele}}++; $alleles{total}++; $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1; } my %genotypes; my %pop_genotypes; for my $indiv (values %indivs) { if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) { my $genotype = join('/', sort($indiv->{chr_1}, $indiv->{chr_2})); if (defined $indiv->{loc_pop_id}) { $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++; $pop_genotypes{$indiv->{loc_pop_id}}{total}++; } $genotypes{$genotype}++; $genotypes{total}++; } } return {pop_alleles => \%pop_alleles, pop_genotypes => \%pop_genotypes, genotypes => \%genotypes, alleles => \%alleles, }; } sub allele_freq{ my ($freq_table,$allele) = @_; my $flipped_allele = $allele; $flipped_allele =~ tr/ACTG/TGAC/; if (defined $allele) { if (exists $freq_table->{$allele} and defined $freq_table->{$allele}) { return $freq_table->{$allele}/$freq_table->{total}; } elsif (exists $freq_table->{$flipped_allele} and defined $freq_table->{$flipped_allele}) { return $freq_table->{$flipped_allele}/$freq_table->{total}; } return 0; } my @alleles = grep {$_ ne 'total'} keys %{$freq_table}; return join(',',map {$_.':'.(defined $freq_table->{$_} ? $freq_table->{$_}/$freq_table->{total} : 0)} @alleles); } sub find_rsid { my %param = @_; my $rv = $param{sth}->execute($param{chr},$param{pos}) // die "Unable to execute statement properly: ".$param{dbh}->errstr; my @snp_ids = grep {$_ =~ /^\d+$/} map {ref $_ eq 'ARRAY'? @{$_}:$_} map {ref $_ eq 'ARRAY'? @{$_}:$_} $sth->fetchall_arrayref([0]); if ($sth->err) { print STDERR $sth->errstr; return '?'; } if (not @snp_ids) { return '.'; } return join(',',@snp_ids); } sub dna_complement{ my ($seq) = @_; return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq); } sub convert_to_prot_seq{ my ($mrna) = @_; $mrna =~ s/U/T/g; my $prot = ''; for my $pos (0..(length($mrna) / 3 - 1)) { my $codon = substr($mrna,$pos*3,3); if (not exists $prot_table->{$codon}) { # this isn't quite right, as we should really be handling # ambiguities which do not result in different amino # acids, but for the time being, do it this way. $prot .= 'N'; } else { $prot .= $prot_table->{$codon}; } } return $prot; } __DATA__ ATT I Ile Isoleucine ATC I Ile Isoleucine ATA I Ile Isoleucine CTT L Leu Leucine CTC L Leu Leucine CTA L Leu Leucine CTG L Leu Leucine TTA L Leu Leucine TTG L Leu Leucine GTT V Val Valine GTC V Val Valine GTA V Val Valine GTG V Val Valine TTT F Phe Phenylalanine TTC F Phe Phenylalanine ATG M Met Methionine TGT C Cys Cysteine TGC C Cys Cysteine GCT A Ala Alanine GCC A Ala Alanine GCA A Ala Alanine GCG A Ala Alanine GGT G Gly Glycine GGC G Gly Glycine GGA G Gly Glycine GGG G Gly Glycine CCT P Pro Proline CCC P Pro Proline CCA P Pro Proline CCG P Pro Proline ACT T Thr Threonine ACC T Thr Threonine ACA T Thr Threonine ACG T Thr Threonine TCT S Ser Serine TCC S Ser Serine TCA S Ser Serine TCG S Ser Serine AGT S Ser Serine AGC S Ser Serine TAT Y Tyr Tyrosine TAC Y Tyr Tyrosine TGG W Trp Tryptophan CAA Q Gln Glutamine CAG Q Gln Glutamine AAT N Asn Asparagine AAC N Asn Asparagine CAT H His Histidine CAC H His Histidine GAA E Glu Glutamic acid GAG E Glu Glutamic acid GAT D Asp Aspartic acid GAC D Asp Aspartic acid AAA K Lys Lysine AAG K Lys Lysine CGT R Arg Arginine CGC R Arg Arginine CGA R Arg Arginine CGG R Arg Arginine AGA R Arg Arginine AGG R Arg Arginine TAA * * Stop codon TAG * * Stop codon TGA * * Stop codon