--- /dev/null
+#!/usr/bin/perl
+# snp_info outputs additional snp information is released under the
+# terms of the GPL version 2, or any later version, at your option.
+# See the file README and COPYING for more information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+BEGIN {$|=1};
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+snp_info - output snp information
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+ --debug, -d debugging level (Default 0)
+ --help, -h display this help
+ --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--nearest-gene>
+
+Return the name of the nearest gene if the SNP isn't in a gene
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+echo 'rs1782034' | snp_info;
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Data::Dumper;
+use Scalar::Util qw(looks_like_number);
+
+
+my %options = (debug => 0,
+ help => 0,
+ man => 0,
+ verbose => 0,
+ fix_pos => 1,
+ nearest_gene => 0,
+ build => '135',
+ refseq => '37_3',
+ );
+
+GetOptions(\%options,
+ 'verbose|v+',
+ 'fix_pos|fix-pos!',
+ 'nearest_gene|nearest-gene!',
+ 'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+# if (1) {
+# push @USAGE_ERRORS,"You must pass something";
+# }
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+ die "Unable to connect to database: ".$DBI::errstr;
+
+my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr;
+SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=?
+END
+
+my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr;
+SELECT * FROM snphistory WHERE snp_id=?;
+END
+
+my $snp_chr_pos_sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT scpr.snp_id AS snp_id,
+ scpr.chr AS chr,
+ scpr.pos AS pos,
+ scl.phys_pos_from AS pos2,
+ scpr.orien AS orien,
+ scl.allele AS ref,
+ uv.var_str AS var_str,
+ ruv.var_str AS rev_var_str,
+ ml.locus_id AS uid,
+ ml.locus_symbol AS symbol,
+ gitn.gene_name AS description,
+ scl.aln_quality AS alignment_quality
+ FROM snp s
+ JOIN b$options{build}_snpchrposonref_$options{refseq} scpr ON s.snp_id=scpr.snp_id
+ JOIN b$options{build}_snpcontigloc_$options{refseq} scl ON scpr.snp_id=scl.snp_id
+ JOIN b$options{build}_contiginfo_$options{refseq} ci ON scl.ctg_id = ci.ctg_id
+ LEFT OUTER JOIN b$options{build}_snpcontiglocusid_$options{refseq} ml ON s.snp_id=ml.snp_id
+ LEFT OUTER JOIN geneidtoname gitn ON ml.locus_id=gitn.gene_id
+ JOIN univariation uv ON s.univar_id=uv.univar_id
+ JOIN univariation ruv ON uv.rev_univar_id=ruv.univar_id
+WHERE (ci.group_term LIKE 'GRCh%' OR ci.group_term='Primary_Assembly') AND s.snp_id=? ORDER BY scl.aln_quality;
+END
+
+my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?;
+END
+
+my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr;
+SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr;
+SELECT * FROM mrna_cds_table WHERE cds_gi=?;
+END
+
+my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr;
+SELECT ssl.snp_id AS snp_id,
+ si.subsnp_id AS subsnp_id,
+ si.chr AS chr,
+ si.submitted_ind_id AS submitted_ind_id,
+ ga.chr_num AS chr_num,
+ ga.rev_flag AS rev_flag,
+ a.allele AS allele
+ FROM snpsubsnplink ssl
+ JOIN subind si on ssl.subsnp_id=si.subsnp_id
+ JOIN gtyallele ga on si.gty_id=ga.gty_id
+ JOIN allele a on ga.fwd_allele_id = a.allele_id
+ WHERE ssl.snp_id=? AND
+ si.chr=? AND ga.rev_flag = '0' AND
+ a.allele <> 'N';
+END
+
+my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_near_gene_start = $dbh->prepare(<<'END') // die "Unable to prepare near gene start statement: ".$dbh->errstr;
+SELECT cds.start AS start,
+ cds.stop AS stop,
+ cds.gene AS symbol,
+ gitn.gene_name AS description
+ FROM cds
+ LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id
+ WHERE cds.chr = ? AND cds.start > ?
+ ORDER BY cds.start ASC LIMIT 1;
+END
+
+my $sth_near_gene_stop = $dbh->prepare(<<'END') // die "Unable to prepare near gene stop statement: ".$dbh->errstr;
+SELECT cds.start AS start,
+ cds.stop AS stop,
+ cds.gene AS symbol,
+ gitn.gene_name AS description
+ FROM cds
+ LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id
+ WHERE cds.chr = ? AND cds.stop < ?
+ ORDER BY cds.stop DESC LIMIT 1;
+END
+
+my $prot_table;
+
+while (<DATA>) {
+ chomp;
+ my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+ $prot_table->{$codon} = $aa;
+}
+
+my @headers = qw(id chr pos ref alt);
+my @row_keys_to_initialize = qw(gene nonsyn aa_ref aa_alt codon_ref codon_alt cdspos protpos loc num var acc phase reffreq altfreq refaltn);
+print join("\t",@headers,'orig_id',@row_keys_to_initialize)."\n";
+while (<>) {
+ print $_ and next if /^#/;
+ chomp;
+ my @row = split /\t/, $_;
+ my %row;
+ @row{@headers} = @row[0..$#headers];
+ $row{orig_id} = $row{id};
+ # this is taken directly from vcf_fill_rsid; we want it for the time being anyway
+ if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) {
+ my $new_id = find_rsid(chr => fixup_chr($row{chr}),
+ # snps are 0 indexed
+ pos => $row{pos} - 1,
+ dbh => $dbh,
+ sth => $sth,
+ );
+ if ($new_id !~ /^(?:[\.\?])$/) {
+ $row{id} = "rs".$new_id;
+ }
+ }
+ if ($row{id} =~ /(rs|ss)(\d+)/) {
+ my $type = $1;
+ my $id = $2;
+ $id = find_real_snp_id(id => $id,
+ dbh => $dbh,
+ sth => $real_snp_sth,
+ );
+ if (not defined $id and
+ is_retired_snp(id => $id,
+ dbh => $dbh,
+ sth => $sth_snp_history,
+ )) {
+ if (defined $row{chr} and defined $row{pos}) {
+ $row{id} = "$row{chr}-$row{pos}";
+ }
+ }
+ else {
+ $row{id} = "$type$id";
+ }
+ }
+
+ if (defined $row{id} and ($options{fix_pos} or
+ not defined $row{pos} or
+ not defined $row{chr} or
+ not defined $row{ref} or
+ not defined $row{alt})) {
+ my %info = find_chr_pos(id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0],
+ dbh => $dbh,
+ sth => $snp_chr_pos_sth,
+ );
+ for my $inf (qw(pos chr ref alt)) {
+ $row{$inf} = $info{$inf} if $options{fix_pos} and defined $info{$inf} or not defined $row{$inf};
+ }
+ # handle partially aligned snps
+ if ($options{fix_pos} and not defined $row{pos}) {
+ $row{pos} = $info{pos2};
+ }
+ if (not defined $row{pos} or not defined $row{chr}) {
+ if (defined $row{pos}) {
+ print STDERR "$row{id} is ambiguously placed\n";
+ }
+ else {
+ print STDERR "Unable to find a position/chr for $row{id}\n";
+ }
+ }
+ }
+ my $mrna_information = mrna_information(dbh => $dbh,
+ sth => $sth_mrna_ss,
+ # these are all 1 indexed
+ pos => $row{pos},
+ chr => fixup_chr($row{chr}),
+ ref => $row{ref},
+ alt => $row{alt},
+ cds_sth => $sth_cds_ss,
+ mrna_cds_sth => $sth_mrna_cds,
+ );
+ @row{@row_keys_to_initialize} = ('') x @row_keys_to_initialize;
+ if (keys %{$mrna_information}) {
+ # strip out existing information in $row{info}
+ my @extra_info;
+ $row{gene} = $mrna_information->{gene};
+ $row{nonsyn} = 1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn};
+ $row{aa_ref} = $mrna_information->{orig_aa} if defined $mrna_information->{orig_aa};
+ $row{aa_alt} = $mrna_information->{new_aa} if defined $mrna_information->{new_aa};
+ $row{codon_ref} = $mrna_information->{orig_codon} if defined $mrna_information->{orig_codon};
+ $row{codon_alt} = $mrna_information->{new_codon} if defined $mrna_information->{new_codon};
+ $row{cdspos} = $mrna_information->{cds_pos} if defined $mrna_information->{cds_pos};
+ $row{protpos} = $mrna_information->{prot_pos} if defined $mrna_information->{prot_pos};
+ $row{loc} = $mrna_information->{exon}?'exon':'intron' if defined $mrna_information->{gene};
+ $row{num} = $mrna_information->{number};
+ $row{var} = $mrna_information->{variant} if defined $mrna_information->{variant};
+ $row{acc} = $mrna_information->{accession};
+ $row{phase} = $mrna_information->{phase} if defined $mrna_information->{phase};
+ }
+ if ((not defined $row{gene} or not length $row{gene})
+ and $options{nearest_gene}) {
+ $row{gene} = find_nearest_gene(dbh => $dbh,
+ start_sth => $sth_near_gene_start,
+ stop_sth => $sth_near_gene_stop,
+ chr => fixup_chr($row{chr}),
+ pos => $row{pos},
+ );
+ }
+ my $allele_freq_information = allele_freq_information(dbh => $dbh,
+ sth => $sth_allele_freq,
+ id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0],
+ # these are all 1 indexed
+ pos => $row{pos},
+ chr => fixup_chr($row{chr}),
+ ref => $row{ref},
+ alt => $row{alt},
+ ) if defined $row{id} and $row{id} =~ /^(?:rs|ss)/;
+ if (keys %{$allele_freq_information}) {
+ # strip out existing information in $row{info}
+ if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) {
+ $row{reffreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref}));
+ $row{altfreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt}));
+ $row{refaltn} = $allele_freq_information->{alleles}{total};
+ }
+ }
+ print join("\t",map {defined $_?$_:'NA'} @row{@headers,'orig_id',@row_keys_to_initialize},$#row > $#headers?@row[@headers..$#row]:())."\n";
+ if ($options{verbose} > 0 and $. % 100 == 0) {
+ print STDERR "At line $.\n";
+ }
+}
+
+sub find_real_snp_id {
+ my %param = @_;
+
+ my %info;
+ my $rv = $param{sth}->execute($param{id},$param{id}) //
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]);
+ if ($param{sth}->err) {
+ print STDERR $param{sth}->errstr;
+ $param{sth}->finish;
+ return $param{id};
+ }
+ $param{sth}->finish;
+ return $real_id // $param{id};
+}
+
+sub find_chr_pos {
+ my %param = @_;
+
+ my %info;
+ my $rv = $param{sth}->execute($param{id}) //
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my $results = $param{sth}->fetchall_arrayref({});
+ if ($param{sth}->err) {
+ print STDERR $param{sth}->errstr;
+ $param{sth}->finish;
+ return %info;
+ }
+ for my $result (@{$results}) {
+ %info = %{$result};
+ }
+ if (keys %info) {
+ if ($info{chr} eq 'NotOn') {
+ undef $info{chr};
+ }
+ if ($info{orien}) {
+ $info{orien_var_str} = $info{rev_var_str};
+ }
+ else {
+ $info{orien_var_str} = $info{var_str};
+ }
+ my @alleles = split qr{/},$info{orien_var_str};
+ ($info{alt}) = grep {$_ ne $info{ref}} @alleles;
+ # fix $row{pos}
+ $info{pos}+=1;
+ }
+ $param{sth}->finish;
+ return %info;
+}
+
+sub find_nearest_gene {
+ my %param = @_;
+ return undef unless defined $param{chr} and defined $param{pos};
+
+ my %possible_genes;
+
+ for my $st (qw(start stop)) {
+ my $rv = $param{"${st}_sth"}->execute($param{chr},$param{pos}) //
+ die "Unable to execute nearest gene $st statement properly: ".$param{dbh}->errstr;
+ my $results = $param{"${st}_sth"}->fetchall_arrayref({});
+ if ($param{"${st}_sth"}->err) {
+ print STDERR $param{"${st}_sth"}->errstr;
+ $param{"${st}_sth"}->finish;
+ return '';
+ }
+ $possible_genes{$st} = $results->[0]
+ }
+ for my $st (qw(start stop)) {
+ # opposite of start/stop (stop/start)
+ my $st_op = $st eq q(start) ? q(stop) : q(start);
+ if (not defined $possible_genes{$st}{start}) {
+ if (defined $possible_genes{$st_op}{start}) {
+ return $possible_genes{$st_op}{symbol}
+ }
+ else {
+ return undef;
+ }
+ }
+ }
+ my $gene;
+ # if the start of the start gene is closer to the position of the
+ # snp than the stop of the stop gene, it's the nearest gene;
+ # otherwise, the stop gene is
+ if ($possible_genes{start}{start} - $param{pos} <
+ $param{pos} - $possible_genes{stop}{stop}
+ ) {
+ return $possible_genes{start}{symbol};
+ }
+ else {
+ return $possible_genes{stop}{symbol};
+ }
+}
+
+
+sub mrna_information{
+ my %param = @_;
+
+ my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+
+ my $results = $param{sth}->fetchall_arrayref({});
+ if ($param{sth}->err) {
+ print STDERR $param{sth}->errstr;
+ return {};
+ }
+ my $chosen_result;
+ for my $result (@{$results}) {
+ if (not defined $chosen_result) {
+ $chosen_result = $result;
+ next;
+ }
+ if (not defined $chosen_result->{variant} or
+ (defined $result->{variant} and
+ $chosen_result->{variant} > $result->{variant})) {
+ $chosen_result = $result;
+ next;
+ }
+ }
+ if (defined $chosen_result and $chosen_result->{exon}) {
+ # try to figure out the cds if there is mrna information and
+ # we're in an exon
+ my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my $cds_results = $param{cds_sth}->fetchall_arrayref({});
+ if ($param{cds_sth}->err) {
+ print STDERR $param{cds_sth}->errstr;
+ }
+ else {
+ print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG;
+ my $chosen_cds;
+ for my $cds_result (@{$cds_results}) {
+ if (not defined $chosen_cds) {
+ $chosen_cds = $cds_result;
+ next;
+ }
+ if (not defined $cds_result->{variant} or
+ (defined $chosen_cds->{variant} and
+ $chosen_cds->{variant} ge $cds_result->{variant})) {
+ $chosen_cds = $cds_result;
+ next;
+ }
+ }
+ if (defined $chosen_cds and keys %{$chosen_cds}) {
+ my $comp_offset = $chosen_cds->{complement}?-1:1;
+ $chosen_result->{in_cds} = 1;
+ $chosen_result->{cds_pos} =
+ ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) +
+ $chosen_cds->{position};
+ $chosen_result->{gene} = $chosen_cds->{gene};
+ $chosen_result->{pos} = $chosen_cds->{pos};
+ $chosen_result->{number} = $chosen_cds->{number};
+ $chosen_result->{variant} = $chosen_cds->{variant};
+ $chosen_result->{acc} = $chosen_cds->{accession};
+ $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3);
+ $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3;
+ print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG;
+ my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref();
+ if (defined $mrna_cds_result and keys %{$mrna_cds_result}) {
+ print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param],
+ [qw(mrna_cds_result chosen_result *param)]) if $DEBUG;
+ my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3) ,3;
+ my $orig_aa = convert_to_prot_seq($codon);
+ my $modified_codon = $codon;
+ substr($modified_codon,$chosen_result->{cds_pos} % 3,1)=
+ ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt});
+ my $new_aa = convert_to_prot_seq($modified_codon);
+ print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3),
+ codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG;
+ if ($DEBUG > 1) {
+ my $a = -1;
+ print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g;
+ }
+ $chosen_result->{orig_aa} = $orig_aa;
+ $chosen_result->{new_aa} = $new_aa;
+ $chosen_result->{new_codon} = $modified_codon;
+ $chosen_result->{orig_codon} = $codon;
+ $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0;
+ }
+ }
+ }
+ }
+ return $chosen_result // {};
+}
+
+
+sub fixup_chr {
+ my ($chr) = @_;
+ return $chr if not defined $chr;
+
+ $chr =~ s/^NC_00000?//;
+ $chr =~ s/^chr//;
+ if (looks_like_number($chr)) {
+ if ($chr == 23) {
+ $chr = "X"
+ }
+ elsif ($chr == 24) {
+ $chr = "Y"
+ }
+ }
+ return ($chr);
+}
+
+sub allele_freq_information{
+ my %param = @_;
+
+ return {} if $param{id} !~ /^\d+$/;
+
+ my $rv = $param{sth}->execute($param{id},$param{chr}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr};
+
+ my $results = $param{sth}->fetchall_arrayref({});
+ if ($param{sth}->err) {
+ print STDERR $param{sth}->errstr;
+ return {};
+ }
+ my %pop_alleles;
+ my %alleles;
+ my %seen_indiv;
+ my %indivs;
+ for my $result (@{$results}) {
+ next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}};
+ $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele};
+ if (defined $result->{loc_pop_id}) {
+ $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id};
+ $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle};
+ $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++;
+ $pop_alleles{$result->{loc_pop_id}}{total}++;
+ }
+ $alleles{$result->{allele}}++;
+ $alleles{total}++;
+ $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1;
+ }
+ my %genotypes;
+ my %pop_genotypes;
+ for my $indiv (values %indivs) {
+ if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) {
+ my $genotype = join('/',
+ sort($indiv->{chr_1},
+ $indiv->{chr_2}));
+ if (defined $indiv->{loc_pop_id}) {
+ $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++;
+ $pop_genotypes{$indiv->{loc_pop_id}}{total}++;
+ }
+ $genotypes{$genotype}++;
+ $genotypes{total}++;
+ }
+ }
+ return {pop_alleles => \%pop_alleles,
+ pop_genotypes => \%pop_genotypes,
+ genotypes => \%genotypes,
+ alleles => \%alleles,
+ };
+}
+
+sub allele_freq{
+ my ($freq_table,$allele) = @_;
+ my $flipped_allele = $allele;
+ $flipped_allele =~ tr/ACTG/TGAC/;
+ if (defined $allele) {
+ if (exists $freq_table->{$allele} and defined $freq_table->{$allele}) {
+ return $freq_table->{$allele}/$freq_table->{total};
+ }
+ elsif (exists $freq_table->{$flipped_allele} and defined $freq_table->{$flipped_allele}) {
+ return $freq_table->{$flipped_allele}/$freq_table->{total};
+ }
+ return 0;
+ }
+ my @alleles = grep {$_ ne 'total'} keys %{$freq_table};
+ return join(',',map {$_.':'.(defined $freq_table->{$_} ?
+ $freq_table->{$_}/$freq_table->{total} : 0)} @alleles);
+}
+
+
+sub find_rsid {
+ my %param = @_;
+
+ my $rv = $param{sth}->execute($param{chr},$param{pos}) //
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my @snp_ids = grep {$_ =~ /^\d+$/}
+ map {ref $_ eq 'ARRAY'? @{$_}:$_}
+ map {ref $_ eq 'ARRAY'? @{$_}:$_}
+ $sth->fetchall_arrayref([0]);
+ if ($sth->err) {
+ print STDERR $sth->errstr;
+ return '?';
+ }
+ if (not @snp_ids) {
+ return '.';
+ }
+ return join(',',@snp_ids);
+}
+
+sub dna_complement{
+ my ($seq) = @_;
+ return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq);
+}
+
+sub convert_to_prot_seq{
+ my ($mrna) = @_;
+ $mrna =~ s/U/T/g;
+ my $prot = '';
+ for my $pos (0..(length($mrna) / 3 - 1)) {
+ my $codon = substr($mrna,$pos*3,3);
+ if (not exists $prot_table->{$codon}) {
+ # this isn't quite right, as we should really be handling
+ # ambiguities which do not result in different amino
+ # acids, but for the time being, do it this way.
+ $prot .= 'N';
+ }
+ else {
+ $prot .= $prot_table->{$codon};
+ }
+ }
+ return $prot;
+}
+
+
+__DATA__
+ATT I Ile Isoleucine
+ATC I Ile Isoleucine
+ATA I Ile Isoleucine
+CTT L Leu Leucine
+CTC L Leu Leucine
+CTA L Leu Leucine
+CTG L Leu Leucine
+TTA L Leu Leucine
+TTG L Leu Leucine
+GTT V Val Valine
+GTC V Val Valine
+GTA V Val Valine
+GTG V Val Valine
+TTT F Phe Phenylalanine
+TTC F Phe Phenylalanine
+ATG M Met Methionine
+TGT C Cys Cysteine
+TGC C Cys Cysteine
+GCT A Ala Alanine
+GCC A Ala Alanine
+GCA A Ala Alanine
+GCG A Ala Alanine
+GGT G Gly Glycine
+GGC G Gly Glycine
+GGA G Gly Glycine
+GGG G Gly Glycine
+CCT P Pro Proline
+CCC P Pro Proline
+CCA P Pro Proline
+CCG P Pro Proline
+ACT T Thr Threonine
+ACC T Thr Threonine
+ACA T Thr Threonine
+ACG T Thr Threonine
+TCT S Ser Serine
+TCC S Ser Serine
+TCA S Ser Serine
+TCG S Ser Serine
+AGT S Ser Serine
+AGC S Ser Serine
+TAT Y Tyr Tyrosine
+TAC Y Tyr Tyrosine
+TGG W Trp Tryptophan
+CAA Q Gln Glutamine
+CAG Q Gln Glutamine
+AAT N Asn Asparagine
+AAC N Asn Asparagine
+CAT H His Histidine
+CAC H His Histidine
+GAA E Glu Glutamic acid
+GAG E Glu Glutamic acid
+GAT D Asp Aspartic acid
+GAC D Asp Aspartic acid
+AAA K Lys Lysine
+AAG K Lys Lysine
+CGT R Arg Arginine
+CGC R Arg Arginine
+CGA R Arg Arginine
+CGG R Arg Arginine
+AGA R Arg Arginine
+AGG R Arg Arginine
+TAA * * Stop codon
+TAG * * Stop codon
+TGA * * Stop codon
--- /dev/null
+#! /usr/bin/perl
+# snp_study_table outputs a table of study results, and is released
+# under the terms of the GPL version 2, or any later version, at your
+# option. See the file README and COPYING for more information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+# $Id: perl_script 1825 2011-01-02 01:53:43Z don $
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+snp_study_table - output an xls table of study results
+
+=head1 SYNOPSIS
+
+snp_study_table [options] [snps]
+
+ Options:
+ --output output filename (Default STDOUT)
+ --debug, -d debugging level (Default 0)
+ --help, -h display this help
+ --man, -m display manual
+
+=head1 SNPS
+
+SNPS can be specified as rs12345 or as chr-pos; in the later case, the
+ga_chr_pos table will be used, in the former, the ga_snp table will be
+used.
+
+=head1 OPTIONS
+
+=over
+
+=item B<--output>
+
+Output filename (defaults to STDOUT)
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+
+=cut
+
+
+use vars qw($DEBUG);
+
+use DBI;
+use Spreadsheet::WriteExcel;
+
+my %options = (debug => 0,
+ help => 0,
+ man => 0,
+ );
+
+GetOptions(\%options,
+ 'output|o=s',
+ 'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+my $out_fh = \*STDOUT;
+if (exists $options{output}) {
+ $out_fh = IO::File->new($options{output},'w') or
+ die "Unable to open $options{output} for writing: $!";
+}
+my $wb = Spreadsheet::WriteExcel->new($out_fh);
+my $ws = $wb->add_worksheet('snp_results');
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+ die "Unable to connect to database: ".$DBI::errstr;
+
+my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_snp WHERE snp_id=?;
+END
+
+my $sth_ga_cp_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_chr_pos WHERE chr=? AND pos=?;
+END
+
+
+if (not @ARGV) {
+ while (<STDIN>) {
+ next if /^#/;
+ chomp;
+ push @ARGV,$_;
+ }
+}
+
+my %studies;
+my %snps;
+for my $snp (@ARGV) {
+ my $results;
+ if ($snp =~ /^(?:rs)(\d+)$/) {
+ $results = get_snp_results(dbh => $dbh,
+ sth => $sth_ga_info,
+ bind => [$1],
+ );
+ }
+ elsif ($snp =~ /^(\d+)-(\d+)$/) {
+ $results = get_snp_results(dbh => $dbh,
+ sth => $sth_ga_cp_info,
+ bind => [$1,$2],
+ );
+ }
+ else {
+ print STDERR "Invalid snp format '$snp'\n";
+ next;
+ }
+ $snps{$snp} = $results;
+ for my $result (@{$results}) {
+ $studies{$result->{study_name}}{$result->{subpart_name}} = 1;
+ }
+}
+$dbh->disconnect();
+
+#column names
+my @cn = ('A'..'Z','AA'..'ZZ');
+
+my $r = 1;
+my $c = 0;
+my $f_center = $wb->add_format(align => 'center');
+$ws->write($cn[$c++].($r+2),'SNP');
+for my $study (sort keys %studies) {
+ my $n_substudy = keys %{$studies{$study}};
+ $ws->write($cn[$c].$r,$study);
+ $ws->merge_range($r-1,$c,$r-1,$c+$n_substudy*3-1,$study,$f_center);
+ for my $substudy (sort keys %{$studies{$study}}) {
+ $ws->write($cn[$c].($r+1),$substudy);
+ $ws->merge_range($r,$c,$r,$c+2,$substudy,$f_center);
+ $ws->write($cn[$c].($r+2),'P value');
+ $ws->write($cn[$c+1].($r+2),'Q value');
+ $ws->write($cn[$c+2].($r+2),'FDR');
+ $c+=3;
+ }
+}
+$r+=3;
+$c=0;
+for my $snp (sort keys %snps) {
+ $ws->write($cn[$c++].$r,$snp);
+ my %snp_studies;
+ for my $row (@{$snps{$snp}}) {
+ $snp_studies{$row->{study_name}}{$row->{subpart_name}} = [@{$row}{qw(pvalue qvalue fdr)}];
+ }
+ for my $study (sort keys %studies) {
+ for my $substudy (sort keys %{$studies{$study}}) {
+ if (not exists $snp_studies{$study} or
+ not exists $snp_studies{$study}{$substudy}
+ ) {
+ $c+=3;
+ }
+ else {
+ for (0..2) {
+ $ws->write($cn[$c++].$r,defined $snp_studies{$study}{$substudy}[$_]?$snp_studies{$study}{$substudy}[$_]:'');
+ }
+ }
+ }
+ }
+ $r++;
+ $c=0;
+}
+
+sub get_snp_results{
+ my %param = @_;
+ my $rv = $param{sth}->execute(@{$param{bind}}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my $results = $param{sth}->fetchall_arrayref({});
+ if ($param{sth}->err) {
+ print STDERR $param{sth}->errstr;
+ return {};
+ }
+ return $results;
+}
+
+
+__END__
--- /dev/null
+#! /usr/bin/perl
+# vcf_add_snp_info adds additional snp information to vcf files, and
+# is released under the terms of the GPL version 2, or any later
+# version, at your option. See the file README and COPYING for more
+# information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+BEGIN {$|=1};
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+vcf_add_snp_info - Add additional snp information to vcf files
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+ --debug, -d debugging level (Default 0)
+ --help, -h display this help
+ --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+vcf_fill_rsid foo.vcf > bar.vcf
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Data::Dumper;
+use Scalar::Util qw(looks_like_number);
+
+
+my %options = (debug => 0,
+ help => 0,
+ man => 0,
+ verbose => 0,
+ build => '135',
+ refseq => '37_3',
+ );
+
+GetOptions(\%options,
+ 'build',
+ 'refseq',
+ 'verbose|v+',
+ 'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+# if (1) {
+# push @USAGE_ERRORS,"You must pass something";
+# }
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+ die "Unable to connect to database: ".$DBI::errstr;
+
+my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?;
+END
+
+my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr;
+SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=?
+END
+
+my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr;
+SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr;
+SELECT * FROM mrna_cds_table WHERE cds_gi=?;
+END
+
+my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr;
+SELECT * FROM snphistory WHERE snp_id=?;
+END
+
+
+my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr;
+SELECT ssl.snp_id AS snp_id,
+ si.subsnp_id AS subsnp_id,
+-- ssl.substrand_reversed_flag,
+ si.chr AS chr,
+ si.submitted_ind_id AS submitted_ind_id,
+ ga.chr_num AS chr_num,
+-- si.submitted_strand_code,
+-- si.allele_flag,
+ ga.rev_flag AS rev_flag,
+ a.allele AS allele,
+ ra.allele AS rev_allele,
+ p.handle AS pop_handle,
+ p.loc_pop_id AS loc_pop_id
+ FROM snpsubsnplink ssl
+ JOIN subind si on ssl.subsnp_id=si.subsnp_id
+ JOIN gtyallele ga on si.gty_id=ga.gty_id
+ JOIN allele a on ga.fwd_allele_id = a.allele_id
+ JOIN allele ra on a.rev_allele_id = ra.allele_id
+ JOIN submittedindividual sind on si.submitted_ind_id=sind.submitted_ind_id
+ JOIN population p on sind.pop_id=p.pop_id
+ WHERE ssl.snp_id=? AND
+ si.chr=? AND ga.rev_flag = '0' AND
+ a.allele <> 'N';
+END
+
+my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_snp WHERE snp_id=?;
+END
+
+my $prot_table;
+
+while (<DATA>) {
+ chomp;
+ my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+ $prot_table->{$codon} = $aa;
+}
+
+my @headers = qw(chr pos id ref alt qual filter info format);
+while (<>) {
+ print $_ and next if /^#/;
+ chomp;
+ my @row = split /\t/, $_;
+ my %row;
+ @row{@headers} = @row[0..$#headers];
+ # this is taken directly from vcf_fill_rsid; we want it for the time being anyway
+ if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) {
+ $row{id} = find_rsid(chr => fixup_chr($row{chr}),
+ # snps are 0 indexed
+ pos => $row{pos} - 1,
+ dbh => $dbh,
+ sth => $sth,
+ );
+ }
+ if ($row{id} =~ /(rs|ss)(\d+)/) {
+ my $type = $1;
+ my $id = $2;
+ $id = find_real_snp_id(id => $id,
+ dbh => $dbh,
+ sth => $real_snp_sth,
+ );
+ if (not defined $id and
+ is_retired_snp(id => $id,
+ dbh => $dbh,
+ sth => $sth_snp_history,
+ )) {
+ if (defined $row{chr} and defined $row{pos}) {
+ $row{id} = undef;
+ }
+ }
+ else {
+ $row{id} = "$type$id";
+ }
+ }
+ my $mrna_information = mrna_information(dbh => $dbh,
+ sth => $sth_mrna_ss,
+ # these are all 1 indexed
+ pos => $row{pos},
+ chr => fixup_chr($row{chr}),
+ ref => $row{ref},
+ alt => $row{alt},
+ cds_sth => $sth_cds_ss,
+ mrna_cds_sth => $sth_mrna_cds,
+ );
+ if (keys %{$mrna_information}) {
+ # strip out existing information in $row{info}
+ $row{info} =~ s{(?:GENE|(?:CDS|PROT|)POS|NUM|VAR|ACC|NONSYN|(?:ALT|REF)(?:AA|CODON))\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+ my @extra_info;
+ push @extra_info,"GENE=".$mrna_information->{gene};
+ push @extra_info,"NONSYN=".1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn};
+ push @extra_info,"REFAA=".$mrna_information->{orig_aa} if defined $mrna_information->{orig_aa};
+ push @extra_info,"ALTAA=".$mrna_information->{new_aa} if defined $mrna_information->{new_aa};
+ push @extra_info,"REFCODON=".$mrna_information->{orig_codon} if defined $mrna_information->{orig_codon};
+ push @extra_info,"ALTCODON=".$mrna_information->{new_codon} if defined $mrna_information->{new_codon};
+ push @extra_info,"CDSPOS=".$mrna_information->{cds_pos} if defined $mrna_information->{cds_pos};
+ push @extra_info,"PROTPOS=".$mrna_information->{prot_pos} if defined $mrna_information->{prot_pos};
+ push @extra_info,"POS=".($mrna_information->{exon}?'exon':'intron');
+ push @extra_info,"NUM=".$mrna_information->{number};
+ push @extra_info,"VAR=".$mrna_information->{variant} if defined $mrna_information->{variant};
+ push @extra_info,"ACC=".$mrna_information->{accession};
+ push @extra_info,"ACCVER=".$mrna_information->{accession_ver};
+ push @extra_info,"PHASE=".$mrna_information->{phase} if defined $mrna_information->{phase};
+ $row{info} = join(';',@extra_info,$row{info});
+ }
+ my $allele_freq_information = allele_freq_information(dbh => $dbh,
+ sth => $sth_allele_freq,
+ id => $row{id},
+ # these are all 1 indexed
+ pos => $row{pos},
+ chr => fixup_chr($row{chr}),
+ ref => $row{ref},
+ alt => $row{alt},
+ );
+ if (keys %{$allele_freq_information}) {
+ # strip out existing information in $row{info}
+ $row{info} =~ s{(?:REFFREQ|ALTFREQ|REFALTN)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+ my @extra_info;
+ if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) {
+ push @extra_info,"REFFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref}));
+ push @extra_info,"ALTFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt}));
+ push @extra_info,"REFALTN=".$allele_freq_information->{alleles}{total};
+ }
+ $row{info} = join(';',@extra_info,$row{info});
+ }
+ my $ga_study_information = ga_study_information(dbh => $dbh,
+ sth => $sth_ga_info,
+ id => $row{id},
+ );
+ if (defined $ga_study_information) {
+ # strip out existing information in $row{info}
+ $row{info} =~ s{(?:GASTUDY|GASTUDYSIG)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+ my @extra_info;
+ push @extra_info,"GASTUDY=".$ga_study_information->{studies};
+ push @extra_info,"GASTUDYSIG=".($ga_study_information->{significant}?'TRUE':'FALSE');
+ $row{info} = join(';',@extra_info,$row{info});
+ }
+ print join("\t",@row{@headers},$#row > $#headers?@row[@headers..$#row]:())."\n";
+ if ($options{verbose} > 0 and $. % 100 == 0) {
+ print STDERR "At line $.\n";
+ }
+}
+
+sub ga_study_information {
+ my %param = @_;
+
+ return undef unless $param{id} =~ /^\d+$/;
+ my $rv = $param{sth}->execute($param{id}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my $results = $param{sth}->fetchall_arrayref({});
+ if (not @{$results}) {
+ return undef;
+ }
+ my $any_significant=0;
+ my @values;
+ for my $result (sort {
+ if (looks_like_number($a->{pvalue})) {
+ if (looks_like_number($b->{pvalue})) {
+ $a->{pvalue} <=> $b->{pvalue};
+ } else {
+ -1;
+ }
+ } elsif (looks_like_number($b->{pvalue})) {
+ 1;
+ } else {
+ 0;
+ }} @{$results}) {
+ if ($result->{significant}) {
+ $any_significant=1;
+ }
+ my $value = $result->{study_name}.':'.$result->{subpart_name};
+ for my $t (qw(pvalue qvalue fdr)) {
+ next unless exists $result->{$t} and defined $result->{$t} and length $result->{$t};
+ my ($fl) = $t =~ m/^(.)/;
+ $value .= ":$fl:".sprintf('%.3g',$result->{$t});
+ }
+ push @values, $value;
+ }
+ return {significant => $any_significant,
+ studies => join(',',@values),
+ };
+
+}
+
+sub mrna_information{
+ my %param = @_;
+
+ my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+
+ my $results = $param{sth}->fetchall_arrayref({});
+ if ($param{sth}->err) {
+ print STDERR $param{sth}->errstr;
+ return {};
+ }
+ my $chosen_result;
+ for my $result (@{$results}) {
+ if (not defined $chosen_result) {
+ $chosen_result = $result;
+ next;
+ }
+ if (not defined $chosen_result->{variant} or
+ (defined $result->{variant} and
+ $chosen_result->{variant} > $result->{variant})) {
+ $chosen_result = $result;
+ next;
+ }
+ }
+ if (defined $chosen_result and $chosen_result->{exon}) {
+ # try to figure out the cds if there is mrna information and
+ # we're in an exon
+ my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my $cds_results = $param{cds_sth}->fetchall_arrayref({});
+ if ($param{cds_sth}->err) {
+ print STDERR $param{cds_sth}->errstr;
+ }
+ else {
+ print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG;
+ my $chosen_cds;
+ for my $cds_result (@{$cds_results}) {
+ if (not defined $chosen_cds) {
+ $chosen_cds = $cds_result;
+ next;
+ }
+ if (not defined $cds_result->{variant} or
+ (defined $chosen_cds->{variant} and
+ $chosen_cds->{variant} ge $cds_result->{variant})) {
+ $chosen_cds = $cds_result;
+ next;
+ }
+ }
+ if (defined $chosen_cds and keys %{$chosen_cds}) {
+ my $comp_offset = $chosen_cds->{complement}?-1:1;
+ $chosen_result->{in_cds} = 1;
+ $chosen_result->{cds_pos} =
+ ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) +
+ $chosen_cds->{position};
+ $chosen_result->{gene} = $chosen_cds->{gene};
+ $chosen_result->{pos} = $chosen_cds->{pos};
+ $chosen_result->{number} = $chosen_cds->{number};
+ $chosen_result->{variant} = $chosen_cds->{variant};
+ $chosen_result->{acc} = $chosen_cds->{accession};
+ $chosen_result->{accver} = $chosen_cds->{accession_ver};
+ $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3);
+ $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3;
+ print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG;
+ my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref();
+ if (defined $mrna_cds_result and keys %{$mrna_cds_result}) {
+ print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param],
+ [qw(mrna_cds_result chosen_result *param)]) if $DEBUG;
+ my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3) ,3;
+ my $orig_aa = convert_to_prot_seq($codon);
+ my $modified_codon = $codon;
+ substr($modified_codon,$chosen_result->{cds_pos} % 3,1)=
+ ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt});
+ my $new_aa = convert_to_prot_seq($modified_codon);
+ print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3),
+ codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG;
+ if ($DEBUG > 1) {
+ my $a = -1;
+ print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g;
+ }
+ $chosen_result->{orig_aa} = $orig_aa;
+ $chosen_result->{new_aa} = $new_aa;
+ $chosen_result->{new_codon} = $modified_codon;
+ $chosen_result->{orig_codon} = $codon;
+ $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0;
+ }
+ }
+ }
+ }
+ return $chosen_result // {};
+}
+
+sub find_real_snp_id {
+ my %param = @_;
+
+ my %info;
+ my $rv = $param{sth}->execute($param{id},$param{id}) //
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]);
+ if ($param{sth}->err) {
+ print STDERR $param{sth}->errstr;
+ $param{sth}->finish;
+ return $param{id};
+ }
+ $param{sth}->finish;
+ return $real_id // $param{id};
+}
+
+sub is_retired_snp {
+ my %param = @_;
+
+ my %info;
+ my $rv = $param{sth}->execute($param{id}) //
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ if ($param{sth}->rows >= 1) {
+ return 1;
+ }
+ return 0;
+}
+
+
+sub fixup_chr {
+ my ($chr) = @_;
+
+ $chr =~ s/^NC_00000?//;
+ $chr =~ s/^chr//;
+ if (looks_like_number($chr)) {
+ if ($chr == 23) {
+ $chr = "X"
+ }
+ elsif ($chr == 24) {
+ $chr = "Y"
+ }
+ }
+ return ($chr);
+}
+
+sub allele_freq_information{
+ my %param = @_;
+
+ return {} if $param{id} !~ /^\d+$/;
+
+ my $rv = $param{sth}->execute($param{id},$param{chr}) or
+ die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr};
+
+ my $results = $param{sth}->fetchall_arrayref({});
+ if ($param{sth}->err) {
+ print STDERR $param{sth}->errstr;
+ return {};
+ }
+ my %pop_alleles;
+ my %alleles;
+ my %seen_indiv;
+ my %indivs;
+ for my $result (@{$results}) {
+ next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}};
+ $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle};
+ $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id};
+ $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele};
+ if (defined $result->{loc_pop_id}) {
+ $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++;
+ $pop_alleles{$result->{loc_pop_id}}{total}++;
+ }
+ $alleles{$result->{allele}}++;
+ $alleles{total}++;
+ $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1;
+ }
+ my %genotypes;
+ my %pop_genotypes;
+ for my $indiv (values %indivs) {
+ if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) {
+ my $genotype = join('/',
+ sort($indiv->{chr_1},
+ $indiv->{chr_2}));
+ $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++;
+ $pop_genotypes{$indiv->{loc_pop_id}}{total}++;
+ $genotypes{$genotype}++;
+ $genotypes{total}++;
+ }
+ }
+ return {pop_alleles => \%pop_alleles,
+ pop_genotypes => \%pop_genotypes,
+ genotypes => \%genotypes,
+ alleles => \%alleles,
+ };
+}
+
+sub allele_freq{
+ my ($freq_table,$allele) = @_;
+ if (defined $allele) {
+ return ((exists $freq_table->{$allele} and defined $freq_table->{$allele}) ?
+ $freq_table->{$allele}/$freq_table->{total} : 0);
+ }
+ my @alleles = grep {$_ ne 'total'} keys %{$freq_table};
+ return join(',',map {$_.':'.(defined $freq_table->{$_} ?
+ $freq_table->{$_}/$freq_table->{total} : 0)} @alleles);
+}
+
+
+sub find_rsid {
+ my %param = @_;
+
+ my $rv = $param{sth}->execute($param{chr},$param{pos}) //
+ die "Unable to execute statement properly: ".$param{dbh}->errstr;
+ my @snp_ids = grep {$_ =~ /^\d+$/}
+ map {ref $_ eq 'ARRAY'? @{$_}:$_}
+ map {ref $_ eq 'ARRAY'? @{$_}:$_}
+ $sth->fetchall_arrayref([0]);
+ if ($sth->err) {
+ print STDERR $sth->errstr;
+ return '?';
+ }
+ if (not @snp_ids) {
+ return '.';
+ }
+ return join(',',@snp_ids);
+}
+
+sub dna_complement{
+ my ($seq) = @_;
+ return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq);
+}
+
+sub convert_to_prot_seq{
+ my ($mrna) = @_;
+ $mrna =~ s/U/T/g;
+ my $prot = '';
+ for my $pos (0..(length($mrna) / 3 - 1)) {
+ my $codon = substr($mrna,$pos*3,3);
+ if (not exists $prot_table->{$codon}) {
+ # this isn't quite right, as we should really be handling
+ # ambiguities which do not result in different amino
+ # acids, but for the time being, do it this way.
+ $prot .= 'N';
+ }
+ else {
+ $prot .= $prot_table->{$codon};
+ }
+ }
+ return $prot;
+}
+
+
+__DATA__
+ATT I Ile Isoleucine
+ATC I Ile Isoleucine
+ATA I Ile Isoleucine
+CTT L Leu Leucine
+CTC L Leu Leucine
+CTA L Leu Leucine
+CTG L Leu Leucine
+TTA L Leu Leucine
+TTG L Leu Leucine
+GTT V Val Valine
+GTC V Val Valine
+GTA V Val Valine
+GTG V Val Valine
+TTT F Phe Phenylalanine
+TTC F Phe Phenylalanine
+ATG M Met Methionine
+TGT C Cys Cysteine
+TGC C Cys Cysteine
+GCT A Ala Alanine
+GCC A Ala Alanine
+GCA A Ala Alanine
+GCG A Ala Alanine
+GGT G Gly Glycine
+GGC G Gly Glycine
+GGA G Gly Glycine
+GGG G Gly Glycine
+CCT P Pro Proline
+CCC P Pro Proline
+CCA P Pro Proline
+CCG P Pro Proline
+ACT T Thr Threonine
+ACC T Thr Threonine
+ACA T Thr Threonine
+ACG T Thr Threonine
+TCT S Ser Serine
+TCC S Ser Serine
+TCA S Ser Serine
+TCG S Ser Serine
+AGT S Ser Serine
+AGC S Ser Serine
+TAT Y Tyr Tyrosine
+TAC Y Tyr Tyrosine
+TGG W Trp Tryptophan
+CAA Q Gln Glutamine
+CAG Q Gln Glutamine
+AAT N Asn Asparagine
+AAC N Asn Asparagine
+CAT H His Histidine
+CAC H His Histidine
+GAA E Glu Glutamic acid
+GAG E Glu Glutamic acid
+GAT D Asp Aspartic acid
+GAC D Asp Aspartic acid
+AAA K Lys Lysine
+AAG K Lys Lysine
+CGT R Arg Arginine
+CGC R Arg Arginine
+CGA R Arg Arginine
+CGG R Arg Arginine
+AGA R Arg Arginine
+AGG R Arg Arginine
+TAA * * Stop codon
+TAG * * Stop codon
+TGA * * Stop codon