]> git.donarmstrong.com Git - bio-dbsnp.git/commitdiff
add base versions of routines master
authorDon Armstrong <don@donarmstrong.com>
Wed, 21 Mar 2012 22:50:20 +0000 (15:50 -0700)
committerDon Armstrong <don@donarmstrong.com>
Wed, 21 Mar 2012 22:50:20 +0000 (15:50 -0700)
bin/snp_info [new file with mode: 0755]
bin/snp_study_table [new file with mode: 0755]
bin/vcf_add_snp_info [new file with mode: 0755]

diff --git a/bin/snp_info b/bin/snp_info
new file mode 100755 (executable)
index 0000000..545d2f3
--- /dev/null
@@ -0,0 +1,706 @@
+#!/usr/bin/perl
+# snp_info outputs additional snp information is released under the
+# terms of the GPL version 2, or any later version, at your option.
+# See the file README and COPYING for more information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+BEGIN {$|=1};
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+snp_info - output snp information
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--nearest-gene>
+
+Return the name of the nearest gene if the SNP isn't in a gene
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+echo 'rs1782034' | snp_info;
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Data::Dumper;
+use Scalar::Util qw(looks_like_number);
+
+
+my %options = (debug           => 0,
+              help            => 0,
+              man             => 0,
+              verbose         => 0,
+              fix_pos         => 1,
+              nearest_gene    => 0,
+              build           => '135',
+              refseq          => '37_3',
+              );
+
+GetOptions(\%options,
+          'verbose|v+',
+          'fix_pos|fix-pos!',
+          'nearest_gene|nearest-gene!',
+          'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+# if (1) {
+#      push @USAGE_ERRORS,"You must pass something";
+# }
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database: ".$DBI::errstr;
+
+my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr;
+SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=?
+END
+
+my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr;
+SELECT * FROM snphistory WHERE snp_id=?;
+END
+
+my $snp_chr_pos_sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT scpr.snp_id AS snp_id,
+       scpr.chr AS chr,
+       scpr.pos AS pos,
+       scl.phys_pos_from AS pos2,
+       scpr.orien AS orien,
+       scl.allele AS ref,
+       uv.var_str AS var_str,
+       ruv.var_str AS rev_var_str,
+       ml.locus_id AS uid,
+       ml.locus_symbol AS symbol,
+       gitn.gene_name AS description,
+       scl.aln_quality AS alignment_quality
+       FROM snp s
+         JOIN b$options{build}_snpchrposonref_$options{refseq} scpr ON s.snp_id=scpr.snp_id
+         JOIN b$options{build}_snpcontigloc_$options{refseq} scl ON scpr.snp_id=scl.snp_id
+         JOIN b$options{build}_contiginfo_$options{refseq} ci ON scl.ctg_id = ci.ctg_id
+         LEFT OUTER JOIN b$options{build}_snpcontiglocusid_$options{refseq} ml ON s.snp_id=ml.snp_id
+         LEFT OUTER JOIN geneidtoname gitn ON ml.locus_id=gitn.gene_id
+         JOIN univariation uv ON s.univar_id=uv.univar_id
+         JOIN univariation ruv ON uv.rev_univar_id=ruv.univar_id
+WHERE (ci.group_term LIKE 'GRCh%' OR ci.group_term='Primary_Assembly') AND s.snp_id=? ORDER BY scl.aln_quality;
+END
+
+my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?;
+END
+
+my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr;
+SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr;
+SELECT * FROM mrna_cds_table WHERE cds_gi=?;
+END
+
+my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr;
+SELECT ssl.snp_id AS snp_id,
+       si.subsnp_id AS subsnp_id,
+       si.chr AS chr,
+       si.submitted_ind_id AS submitted_ind_id,
+       ga.chr_num AS chr_num,
+       ga.rev_flag AS rev_flag,
+       a.allele AS allele
+  FROM snpsubsnplink ssl
+    JOIN subind si on ssl.subsnp_id=si.subsnp_id
+    JOIN gtyallele ga on si.gty_id=ga.gty_id
+    JOIN allele a on ga.fwd_allele_id = a.allele_id
+    WHERE ssl.snp_id=? AND
+          si.chr=? AND ga.rev_flag = '0' AND
+          a.allele <> 'N';
+END
+
+my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_near_gene_start = $dbh->prepare(<<'END') // die "Unable to prepare near gene start statement: ".$dbh->errstr;
+SELECT cds.start AS start,
+       cds.stop AS stop,
+       cds.gene AS symbol,
+       gitn.gene_name AS description
+    FROM cds
+      LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id
+    WHERE cds.chr = ? AND cds.start > ?
+    ORDER BY cds.start ASC LIMIT 1;
+END
+
+my $sth_near_gene_stop = $dbh->prepare(<<'END') // die "Unable to prepare near gene stop statement: ".$dbh->errstr;
+SELECT cds.start AS start,
+       cds.stop AS stop,
+       cds.gene AS symbol,
+       gitn.gene_name AS description
+    FROM cds
+      LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id
+    WHERE cds.chr = ? AND cds.stop < ?
+    ORDER BY cds.stop DESC LIMIT 1;
+END
+
+my $prot_table;
+
+while (<DATA>) {
+    chomp;
+    my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+    $prot_table->{$codon} = $aa;
+}
+
+my @headers = qw(id chr pos ref alt);
+my @row_keys_to_initialize = qw(gene nonsyn aa_ref aa_alt codon_ref codon_alt cdspos protpos loc num var acc phase reffreq altfreq refaltn);
+print join("\t",@headers,'orig_id',@row_keys_to_initialize)."\n";
+while (<>) {
+    print $_ and next if /^#/;
+    chomp;
+    my @row = split /\t/, $_;
+    my %row;
+    @row{@headers} = @row[0..$#headers];
+    $row{orig_id} = $row{id};
+    # this is taken directly from vcf_fill_rsid; we want it for the time being anyway
+    if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) {
+       my $new_id = find_rsid(chr => fixup_chr($row{chr}),
+                              # snps are 0 indexed
+                              pos => $row{pos} - 1,
+                              dbh => $dbh,
+                              sth => $sth,
+                             );
+       if ($new_id !~ /^(?:[\.\?])$/) {
+           $row{id} = "rs".$new_id;
+       }
+    }
+    if ($row{id} =~ /(rs|ss)(\d+)/) {
+       my $type = $1;
+       my $id = $2;
+       $id = find_real_snp_id(id => $id,
+                              dbh => $dbh,
+                              sth => $real_snp_sth,
+                             );
+       if (not defined $id and
+           is_retired_snp(id  => $id,
+                          dbh => $dbh,
+                          sth => $sth_snp_history,
+                         )) {
+           if (defined $row{chr} and defined $row{pos}) {
+               $row{id} = "$row{chr}-$row{pos}";
+           }
+       }
+       else {
+           $row{id} = "$type$id";
+       }
+    }
+
+    if (defined $row{id} and ($options{fix_pos} or
+                             not defined $row{pos} or
+                             not defined $row{chr} or
+                             not defined $row{ref} or
+                             not defined $row{alt})) {
+       my %info = find_chr_pos(id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0],
+                               dbh => $dbh,
+                               sth => $snp_chr_pos_sth,
+                              );
+       for my $inf (qw(pos chr ref alt)) {
+           $row{$inf} = $info{$inf} if $options{fix_pos} and defined $info{$inf} or not defined $row{$inf};
+       }
+       # handle partially aligned snps
+       if ($options{fix_pos} and not defined $row{pos}) {
+           $row{pos} = $info{pos2};
+       }
+       if (not defined $row{pos} or not defined $row{chr}) {
+           if (defined $row{pos}) {
+               print STDERR "$row{id} is ambiguously placed\n";
+           }
+           else {
+               print STDERR "Unable to find a position/chr for $row{id}\n";
+           }
+       }
+    }
+    my $mrna_information = mrna_information(dbh => $dbh,
+                                           sth => $sth_mrna_ss,
+                                           # these are all 1 indexed
+                                           pos => $row{pos},
+                                           chr => fixup_chr($row{chr}),
+                                           ref => $row{ref},
+                                           alt => $row{alt},
+                                           cds_sth => $sth_cds_ss,
+                                           mrna_cds_sth => $sth_mrna_cds,
+                                          );
+    @row{@row_keys_to_initialize} = ('') x @row_keys_to_initialize;
+    if (keys %{$mrna_information}) {
+       # strip out existing information in $row{info}
+       my @extra_info;
+       $row{gene}     = $mrna_information->{gene};
+       $row{nonsyn}   = 1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn};
+       $row{aa_ref}    = $mrna_information->{orig_aa} if defined $mrna_information->{orig_aa};
+       $row{aa_alt}    = $mrna_information->{new_aa} if defined $mrna_information->{new_aa};
+       $row{codon_ref} = $mrna_information->{orig_codon} if defined $mrna_information->{orig_codon};
+       $row{codon_alt} = $mrna_information->{new_codon} if defined $mrna_information->{new_codon};
+       $row{cdspos}   = $mrna_information->{cds_pos} if defined $mrna_information->{cds_pos};
+       $row{protpos}  = $mrna_information->{prot_pos} if defined $mrna_information->{prot_pos};
+       $row{loc}      = $mrna_information->{exon}?'exon':'intron' if defined $mrna_information->{gene};
+       $row{num}      = $mrna_information->{number};
+       $row{var}      = $mrna_information->{variant} if defined $mrna_information->{variant};
+       $row{acc}      = $mrna_information->{accession};
+       $row{phase}    = $mrna_information->{phase} if defined $mrna_information->{phase};
+    }
+    if ((not defined $row{gene} or not length $row{gene})
+       and $options{nearest_gene}) {
+       $row{gene} = find_nearest_gene(dbh => $dbh,
+                                      start_sth => $sth_near_gene_start,
+                                      stop_sth => $sth_near_gene_stop,
+                                      chr => fixup_chr($row{chr}),
+                                      pos => $row{pos},
+                                     );
+    }
+    my $allele_freq_information = allele_freq_information(dbh => $dbh,
+                                                         sth => $sth_allele_freq,
+                                                         id  => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0],
+                                                         # these are all 1 indexed
+                                                         pos => $row{pos},
+                                                         chr => fixup_chr($row{chr}),
+                                                         ref => $row{ref},
+                                                         alt => $row{alt},
+                                                        ) if defined $row{id} and $row{id} =~ /^(?:rs|ss)/;
+    if (keys %{$allele_freq_information}) {
+       # strip out existing information in $row{info}
+       if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) {
+           $row{reffreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref}));
+           $row{altfreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt}));
+           $row{refaltn} = $allele_freq_information->{alleles}{total};
+       }
+    }
+    print join("\t",map {defined $_?$_:'NA'} @row{@headers,'orig_id',@row_keys_to_initialize},$#row > $#headers?@row[@headers..$#row]:())."\n";
+    if ($options{verbose} > 0 and $. % 100 == 0) {
+       print STDERR "At line $.\n";
+    }
+}
+
+sub find_real_snp_id {
+    my %param = @_;
+
+    my %info;
+    my $rv = $param{sth}->execute($param{id},$param{id}) //
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]);
+    if ($param{sth}->err) {
+       print STDERR $param{sth}->errstr;
+       $param{sth}->finish;
+       return $param{id};
+    }
+    $param{sth}->finish;
+    return $real_id // $param{id};
+}
+
+sub find_chr_pos {
+    my %param = @_;
+
+    my %info;
+    my $rv = $param{sth}->execute($param{id}) //
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+       print STDERR $param{sth}->errstr;
+       $param{sth}->finish;
+       return %info;
+    }
+    for my $result (@{$results}) {
+       %info = %{$result};
+    }
+    if (keys %info) {
+       if ($info{chr} eq 'NotOn') {
+           undef $info{chr};
+       }
+       if ($info{orien}) {
+           $info{orien_var_str} = $info{rev_var_str};
+       }
+       else {
+           $info{orien_var_str} = $info{var_str};
+       }
+       my @alleles = split qr{/},$info{orien_var_str};
+       ($info{alt}) = grep {$_ ne $info{ref}} @alleles;
+       # fix $row{pos}
+       $info{pos}+=1;
+    }
+    $param{sth}->finish;
+    return %info;
+}
+
+sub find_nearest_gene {
+    my %param = @_;
+    return undef unless defined $param{chr} and defined $param{pos};
+
+    my %possible_genes;
+
+    for my $st (qw(start stop)) {
+       my $rv = $param{"${st}_sth"}->execute($param{chr},$param{pos}) //
+           die "Unable to execute nearest gene $st statement properly: ".$param{dbh}->errstr;
+       my $results = $param{"${st}_sth"}->fetchall_arrayref({});
+       if ($param{"${st}_sth"}->err) {
+           print STDERR $param{"${st}_sth"}->errstr;
+           $param{"${st}_sth"}->finish;
+           return '';
+       }
+       $possible_genes{$st} = $results->[0]
+    }
+    for my $st (qw(start stop)) {
+       # opposite of start/stop (stop/start)
+       my $st_op = $st eq q(start) ? q(stop) : q(start);
+       if (not defined $possible_genes{$st}{start}) {
+           if (defined $possible_genes{$st_op}{start}) {
+               return $possible_genes{$st_op}{symbol}
+           }
+           else {
+               return undef;
+           }
+       }
+    }
+    my $gene;
+    # if the start of the start gene is closer to the position of the
+    # snp than the stop of the stop gene, it's the nearest gene;
+    # otherwise, the stop gene is
+    if ($possible_genes{start}{start} - $param{pos} <
+       $param{pos} - $possible_genes{stop}{stop}
+       ) {
+       return $possible_genes{start}{symbol};
+    }
+    else {
+       return $possible_genes{stop}{symbol};
+    }
+}
+
+
+sub mrna_information{
+    my %param = @_;
+
+    my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+       print STDERR $param{sth}->errstr;
+       return {};
+    }
+    my $chosen_result;
+    for my $result (@{$results}) {
+       if (not defined $chosen_result) {
+           $chosen_result = $result;
+           next;
+       }
+       if (not defined $chosen_result->{variant} or
+           (defined $result->{variant} and
+            $chosen_result->{variant} > $result->{variant})) {
+           $chosen_result = $result;
+           next;
+       }
+    }
+    if (defined $chosen_result and $chosen_result->{exon}) {
+       # try to figure out the cds if there is mrna information and
+       # we're in an exon
+       my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or
+           die "Unable to execute statement properly: ".$param{dbh}->errstr;
+       my $cds_results = $param{cds_sth}->fetchall_arrayref({});
+       if ($param{cds_sth}->err) {
+           print STDERR $param{cds_sth}->errstr;
+       }
+       else {
+           print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG;
+           my $chosen_cds;
+           for my $cds_result (@{$cds_results}) {
+               if (not defined $chosen_cds) {
+                   $chosen_cds = $cds_result;
+                   next;
+               }
+               if (not defined $cds_result->{variant} or
+                   (defined $chosen_cds->{variant} and
+                    $chosen_cds->{variant} ge $cds_result->{variant})) {
+                   $chosen_cds = $cds_result;
+                   next;
+               }
+           }
+           if (defined $chosen_cds and keys %{$chosen_cds}) {
+               my $comp_offset = $chosen_cds->{complement}?-1:1;
+               $chosen_result->{in_cds} = 1;
+               $chosen_result->{cds_pos} =
+                   ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) +
+                       $chosen_cds->{position};
+               $chosen_result->{gene} = $chosen_cds->{gene};
+               $chosen_result->{pos} = $chosen_cds->{pos};
+               $chosen_result->{number} = $chosen_cds->{number};
+               $chosen_result->{variant} = $chosen_cds->{variant};
+               $chosen_result->{acc} = $chosen_cds->{accession};
+               $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3);
+               $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3;
+               print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG;
+               my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or
+                   die "Unable to execute statement properly: ".$param{dbh}->errstr;
+               my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref();
+               if (defined $mrna_cds_result and keys %{$mrna_cds_result}) {
+                   print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param],
+                                                   [qw(mrna_cds_result chosen_result *param)]) if $DEBUG;
+                   my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3)  ,3;
+                   my $orig_aa = convert_to_prot_seq($codon);
+                   my $modified_codon = $codon;
+                   substr($modified_codon,$chosen_result->{cds_pos} % 3,1)=
+                       ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt});
+                   my $new_aa = convert_to_prot_seq($modified_codon);
+                   print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3),
+                                                                                    codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG;
+                   if ($DEBUG > 1) {
+                       my $a = -1;
+                       print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g;
+                   }
+                   $chosen_result->{orig_aa} = $orig_aa;
+                   $chosen_result->{new_aa} = $new_aa;
+                   $chosen_result->{new_codon} = $modified_codon;
+                   $chosen_result->{orig_codon} = $codon;
+                   $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0;
+               }
+           }
+       }
+    }
+    return $chosen_result // {};
+}
+
+
+sub fixup_chr {
+    my ($chr) = @_;
+    return $chr if not defined $chr;
+
+    $chr =~ s/^NC_00000?//;
+    $chr =~ s/^chr//;
+    if (looks_like_number($chr)) {
+       if ($chr == 23) {
+           $chr = "X"
+       }
+       elsif ($chr == 24) {
+           $chr = "Y"
+       }
+    }
+    return ($chr);
+}
+
+sub allele_freq_information{
+    my %param = @_;
+
+    return {} if $param{id} !~ /^\d+$/;
+
+    my $rv = $param{sth}->execute($param{id},$param{chr}) or
+       die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr};
+
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+       print STDERR $param{sth}->errstr;
+       return {};
+    }
+    my %pop_alleles;
+    my %alleles;
+    my %seen_indiv;
+    my %indivs;
+    for my $result (@{$results}) {
+       next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}};
+       $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele};
+       if (defined $result->{loc_pop_id}) {
+           $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id};
+           $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle};
+           $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++;
+           $pop_alleles{$result->{loc_pop_id}}{total}++;
+       }
+       $alleles{$result->{allele}}++;
+       $alleles{total}++;
+       $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1;
+    }
+    my %genotypes;
+    my %pop_genotypes;
+    for my $indiv (values %indivs) {
+       if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) {
+           my $genotype = join('/',
+                               sort($indiv->{chr_1},
+                                    $indiv->{chr_2}));
+           if (defined $indiv->{loc_pop_id}) {
+               $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++;
+               $pop_genotypes{$indiv->{loc_pop_id}}{total}++;
+           }
+           $genotypes{$genotype}++;
+           $genotypes{total}++;
+       }
+    }
+    return {pop_alleles   => \%pop_alleles,
+           pop_genotypes => \%pop_genotypes,
+           genotypes     => \%genotypes,
+           alleles       => \%alleles,
+          };
+}
+
+sub allele_freq{
+    my ($freq_table,$allele) = @_;
+    my $flipped_allele = $allele;
+    $flipped_allele =~ tr/ACTG/TGAC/;
+    if (defined $allele) {
+       if (exists $freq_table->{$allele} and defined $freq_table->{$allele}) {
+           return $freq_table->{$allele}/$freq_table->{total};
+       }
+       elsif (exists $freq_table->{$flipped_allele} and defined $freq_table->{$flipped_allele}) {
+           return $freq_table->{$flipped_allele}/$freq_table->{total};
+       }
+       return 0;
+    }
+    my @alleles = grep {$_ ne 'total'} keys %{$freq_table};
+    return join(',',map {$_.':'.(defined $freq_table->{$_} ?
+                                $freq_table->{$_}/$freq_table->{total} : 0)} @alleles);
+}
+
+
+sub find_rsid {
+    my %param = @_;
+
+    my $rv = $param{sth}->execute($param{chr},$param{pos}) //
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my @snp_ids = grep {$_ =~ /^\d+$/}
+       map {ref $_ eq 'ARRAY'? @{$_}:$_}
+           map {ref $_ eq 'ARRAY'? @{$_}:$_}
+               $sth->fetchall_arrayref([0]);
+    if ($sth->err) {
+       print STDERR $sth->errstr;
+       return '?';
+    }
+    if (not @snp_ids) {
+       return '.';
+    }
+    return join(',',@snp_ids);
+}
+
+sub dna_complement{
+    my ($seq) = @_;
+    return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq);
+}
+
+sub convert_to_prot_seq{
+    my ($mrna) = @_;
+    $mrna =~ s/U/T/g;
+    my $prot = '';
+    for my $pos (0..(length($mrna) / 3 - 1)) {
+       my $codon = substr($mrna,$pos*3,3);
+       if (not exists $prot_table->{$codon}) {
+           # this isn't quite right, as we should really be handling
+           # ambiguities which do not result in different amino
+           # acids, but for the time being, do it this way.
+           $prot .= 'N';
+       }
+       else {
+           $prot .= $prot_table->{$codon};
+       }
+    }
+    return $prot;
+}
+
+
+__DATA__
+ATT    I       Ile     Isoleucine
+ATC    I       Ile     Isoleucine
+ATA    I       Ile     Isoleucine
+CTT    L       Leu     Leucine
+CTC    L       Leu     Leucine
+CTA    L       Leu     Leucine
+CTG    L       Leu     Leucine
+TTA    L       Leu     Leucine
+TTG    L       Leu     Leucine
+GTT    V       Val     Valine
+GTC    V       Val     Valine
+GTA    V       Val     Valine
+GTG    V       Val     Valine
+TTT    F       Phe     Phenylalanine
+TTC    F       Phe     Phenylalanine
+ATG    M       Met     Methionine
+TGT    C       Cys     Cysteine
+TGC    C       Cys     Cysteine
+GCT    A       Ala     Alanine
+GCC    A       Ala     Alanine
+GCA    A       Ala     Alanine
+GCG    A       Ala     Alanine
+GGT    G       Gly     Glycine
+GGC    G       Gly     Glycine
+GGA    G       Gly     Glycine
+GGG    G       Gly     Glycine
+CCT    P       Pro     Proline
+CCC    P       Pro     Proline
+CCA    P       Pro     Proline
+CCG    P       Pro     Proline
+ACT    T       Thr     Threonine
+ACC    T       Thr     Threonine
+ACA    T       Thr     Threonine
+ACG    T       Thr     Threonine
+TCT    S       Ser     Serine
+TCC    S       Ser     Serine
+TCA    S       Ser     Serine
+TCG    S       Ser     Serine
+AGT    S       Ser     Serine
+AGC    S       Ser     Serine
+TAT    Y       Tyr     Tyrosine
+TAC    Y       Tyr     Tyrosine
+TGG    W       Trp     Tryptophan
+CAA    Q       Gln     Glutamine
+CAG    Q       Gln     Glutamine
+AAT    N       Asn     Asparagine
+AAC    N       Asn     Asparagine
+CAT    H       His     Histidine
+CAC    H       His     Histidine
+GAA    E       Glu     Glutamic acid
+GAG    E       Glu     Glutamic acid
+GAT    D       Asp     Aspartic acid
+GAC    D       Asp     Aspartic acid
+AAA    K       Lys     Lysine
+AAG    K       Lys     Lysine
+CGT    R       Arg     Arginine
+CGC    R       Arg     Arginine
+CGA    R       Arg     Arginine
+CGG    R       Arg     Arginine
+AGA    R       Arg     Arginine
+AGG    R       Arg     Arginine
+TAA    *       *       Stop codon
+TAG    *       *       Stop codon
+TGA    *       *       Stop codon
diff --git a/bin/snp_study_table b/bin/snp_study_table
new file mode 100755 (executable)
index 0000000..6a8d887
--- /dev/null
@@ -0,0 +1,200 @@
+#! /usr/bin/perl
+# snp_study_table outputs a table of study results, and is released
+# under the terms of the GPL version 2, or any later version, at your
+# option. See the file README and COPYING for more information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+# $Id: perl_script 1825 2011-01-02 01:53:43Z don $
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+snp_study_table - output an xls table of study results
+
+=head1 SYNOPSIS
+
+snp_study_table [options] [snps]
+
+ Options:
+  --output output filename (Default STDOUT)
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 SNPS
+
+SNPS can be specified as rs12345 or as chr-pos; in the later case, the
+ga_chr_pos table will be used, in the former, the ga_snp table will be
+used.
+
+=head1 OPTIONS
+
+=over
+
+=item B<--output>
+
+Output filename (defaults to STDOUT)
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+
+=cut
+
+
+use vars qw($DEBUG);
+
+use DBI;
+use Spreadsheet::WriteExcel;
+
+my %options = (debug           => 0,
+              help            => 0,
+              man             => 0,
+              );
+
+GetOptions(\%options,
+          'output|o=s',
+          'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+my $out_fh = \*STDOUT;
+if (exists $options{output}) {
+    $out_fh = IO::File->new($options{output},'w') or
+       die "Unable to open $options{output} for writing: $!";
+}
+my $wb = Spreadsheet::WriteExcel->new($out_fh);
+my $ws = $wb->add_worksheet('snp_results');
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database: ".$DBI::errstr;
+
+my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_snp WHERE snp_id=?;
+END
+
+my $sth_ga_cp_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_chr_pos WHERE chr=? AND pos=?;
+END
+
+
+if (not @ARGV) {
+    while (<STDIN>) {
+       next if /^#/;
+       chomp;
+       push @ARGV,$_;
+    }
+}
+
+my %studies;
+my %snps;
+for my $snp (@ARGV) {
+    my $results;
+    if ($snp =~ /^(?:rs)(\d+)$/) {
+       $results = get_snp_results(dbh => $dbh,
+                                  sth => $sth_ga_info,
+                                  bind => [$1],
+                                 );
+    }
+    elsif ($snp =~ /^(\d+)-(\d+)$/) {
+       $results = get_snp_results(dbh => $dbh,
+                                  sth => $sth_ga_cp_info,
+                                  bind => [$1,$2],
+                                 );
+    }
+    else {
+       print STDERR "Invalid snp format '$snp'\n";
+       next;
+    }
+    $snps{$snp} = $results;
+    for my $result (@{$results}) {
+       $studies{$result->{study_name}}{$result->{subpart_name}} = 1;
+    }
+}
+$dbh->disconnect();
+
+#column names
+my @cn = ('A'..'Z','AA'..'ZZ');
+
+my $r = 1;
+my $c = 0;
+my $f_center = $wb->add_format(align => 'center');
+$ws->write($cn[$c++].($r+2),'SNP');
+for my $study (sort keys %studies) {
+    my $n_substudy = keys %{$studies{$study}};
+    $ws->write($cn[$c].$r,$study);
+    $ws->merge_range($r-1,$c,$r-1,$c+$n_substudy*3-1,$study,$f_center);
+    for my $substudy (sort keys %{$studies{$study}}) {
+       $ws->write($cn[$c].($r+1),$substudy);
+       $ws->merge_range($r,$c,$r,$c+2,$substudy,$f_center);
+       $ws->write($cn[$c].($r+2),'P value');
+       $ws->write($cn[$c+1].($r+2),'Q value');
+       $ws->write($cn[$c+2].($r+2),'FDR');
+       $c+=3;
+    }
+}
+$r+=3;
+$c=0;
+for my $snp (sort keys %snps) {
+    $ws->write($cn[$c++].$r,$snp);
+    my %snp_studies;
+    for my $row (@{$snps{$snp}}) {
+       $snp_studies{$row->{study_name}}{$row->{subpart_name}} = [@{$row}{qw(pvalue qvalue fdr)}];
+    }
+    for my $study (sort keys %studies) {
+       for my $substudy (sort keys %{$studies{$study}}) {
+           if (not exists $snp_studies{$study} or
+               not exists $snp_studies{$study}{$substudy}
+              ) {
+               $c+=3;
+           }
+           else {
+               for (0..2) {
+                   $ws->write($cn[$c++].$r,defined $snp_studies{$study}{$substudy}[$_]?$snp_studies{$study}{$substudy}[$_]:'');
+               }
+           }
+       }
+    }
+    $r++;
+    $c=0;
+}
+
+sub get_snp_results{
+    my %param = @_;
+    my $rv = $param{sth}->execute(@{$param{bind}}) or
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+       print STDERR $param{sth}->errstr;
+       return {};
+    }
+    return $results;
+}
+
+
+__END__
diff --git a/bin/vcf_add_snp_info b/bin/vcf_add_snp_info
new file mode 100755 (executable)
index 0000000..db3dcb6
--- /dev/null
@@ -0,0 +1,611 @@
+#! /usr/bin/perl
+# vcf_add_snp_info adds additional snp information to vcf files, and
+# is released under the terms of the GPL version 2, or any later
+# version, at your option. See the file README and COPYING for more
+# information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+BEGIN {$|=1};
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+vcf_add_snp_info - Add additional snp information to vcf files
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+vcf_fill_rsid foo.vcf > bar.vcf
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Data::Dumper;
+use Scalar::Util qw(looks_like_number);
+
+
+my %options = (debug           => 0,
+              help            => 0,
+              man             => 0,
+              verbose         => 0,
+              build           => '135',
+              refseq          => '37_3',
+              );
+
+GetOptions(\%options,
+          'build',
+          'refseq',
+          'verbose|v+',
+          'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+# if (1) {
+#      push @USAGE_ERRORS,"You must pass something";
+# }
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database: ".$DBI::errstr;
+
+my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?;
+END
+
+my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr;
+SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=?
+END
+
+my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr;
+SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr;
+SELECT * FROM mrna_cds_table WHERE cds_gi=?;
+END
+
+my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr;
+SELECT * FROM snphistory WHERE snp_id=?;
+END
+
+
+my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr;
+SELECT ssl.snp_id AS snp_id,
+       si.subsnp_id AS subsnp_id,
+--        ssl.substrand_reversed_flag,
+       si.chr AS chr,
+       si.submitted_ind_id AS submitted_ind_id,
+       ga.chr_num AS chr_num,
+--       si.submitted_strand_code,
+--       si.allele_flag,
+       ga.rev_flag AS rev_flag,
+       a.allele AS allele,
+       ra.allele AS rev_allele,
+       p.handle AS pop_handle,
+       p.loc_pop_id AS loc_pop_id
+  FROM snpsubsnplink ssl
+    JOIN subind si on ssl.subsnp_id=si.subsnp_id
+    JOIN gtyallele ga on si.gty_id=ga.gty_id
+    JOIN allele a on ga.fwd_allele_id = a.allele_id
+    JOIN allele ra on a.rev_allele_id = ra.allele_id
+    JOIN submittedindividual sind on si.submitted_ind_id=sind.submitted_ind_id
+    JOIN population p on sind.pop_id=p.pop_id
+    WHERE ssl.snp_id=? AND
+          si.chr=? AND ga.rev_flag = '0' AND
+          a.allele <> 'N';
+END
+
+my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_snp WHERE snp_id=?;
+END
+
+my $prot_table;
+
+while (<DATA>) {
+    chomp;
+    my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+    $prot_table->{$codon} = $aa;
+}
+
+my @headers = qw(chr pos id ref alt qual filter info format);
+while (<>) {
+    print $_ and next if /^#/;
+    chomp;
+    my @row = split /\t/, $_;
+    my %row;
+    @row{@headers} = @row[0..$#headers];
+    # this is taken directly from vcf_fill_rsid; we want it for the time being anyway
+    if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) {
+       $row{id} = find_rsid(chr => fixup_chr($row{chr}),
+                            # snps are 0 indexed
+                            pos => $row{pos} - 1,
+                            dbh => $dbh,
+                            sth => $sth,
+                           );
+    }
+    if ($row{id} =~ /(rs|ss)(\d+)/) {
+       my $type = $1;
+       my $id = $2;
+       $id = find_real_snp_id(id => $id,
+                              dbh => $dbh,
+                              sth => $real_snp_sth,
+                             );
+       if (not defined $id and
+           is_retired_snp(id  => $id,
+                          dbh => $dbh,
+                          sth => $sth_snp_history,
+                         )) {
+           if (defined $row{chr} and defined $row{pos}) {
+               $row{id} = undef;
+           }
+       }
+       else {
+           $row{id} = "$type$id";
+       }
+    }
+    my $mrna_information = mrna_information(dbh => $dbh,
+                                           sth => $sth_mrna_ss,
+                                           # these are all 1 indexed
+                                           pos => $row{pos},
+                                           chr => fixup_chr($row{chr}),
+                                           ref => $row{ref},
+                                           alt => $row{alt},
+                                           cds_sth => $sth_cds_ss,
+                                           mrna_cds_sth => $sth_mrna_cds,
+                                          );
+    if (keys %{$mrna_information}) {
+       # strip out existing information in $row{info}
+       $row{info} =~ s{(?:GENE|(?:CDS|PROT|)POS|NUM|VAR|ACC|NONSYN|(?:ALT|REF)(?:AA|CODON))\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+       my @extra_info;
+       push @extra_info,"GENE=".$mrna_information->{gene};
+       push @extra_info,"NONSYN=".1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn};
+       push @extra_info,"REFAA=".$mrna_information->{orig_aa} if defined $mrna_information->{orig_aa};
+       push @extra_info,"ALTAA=".$mrna_information->{new_aa} if defined $mrna_information->{new_aa};
+       push @extra_info,"REFCODON=".$mrna_information->{orig_codon} if defined $mrna_information->{orig_codon};
+       push @extra_info,"ALTCODON=".$mrna_information->{new_codon} if defined $mrna_information->{new_codon};
+       push @extra_info,"CDSPOS=".$mrna_information->{cds_pos} if defined $mrna_information->{cds_pos};
+       push @extra_info,"PROTPOS=".$mrna_information->{prot_pos} if defined $mrna_information->{prot_pos};
+       push @extra_info,"POS=".($mrna_information->{exon}?'exon':'intron');
+       push @extra_info,"NUM=".$mrna_information->{number};
+       push @extra_info,"VAR=".$mrna_information->{variant} if defined $mrna_information->{variant};
+       push @extra_info,"ACC=".$mrna_information->{accession};
+       push @extra_info,"ACCVER=".$mrna_information->{accession_ver};
+       push @extra_info,"PHASE=".$mrna_information->{phase} if defined $mrna_information->{phase};
+       $row{info} = join(';',@extra_info,$row{info});
+    }
+    my $allele_freq_information = allele_freq_information(dbh => $dbh,
+                                                         sth => $sth_allele_freq,
+                                                         id  => $row{id},
+                                                         # these are all 1 indexed
+                                                         pos => $row{pos},
+                                                         chr => fixup_chr($row{chr}),
+                                                         ref => $row{ref},
+                                                         alt => $row{alt},
+                                                        );
+    if (keys %{$allele_freq_information}) {
+       # strip out existing information in $row{info}
+       $row{info} =~ s{(?:REFFREQ|ALTFREQ|REFALTN)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+       my @extra_info;
+       if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) {
+           push @extra_info,"REFFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref}));
+           push @extra_info,"ALTFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt}));
+           push @extra_info,"REFALTN=".$allele_freq_information->{alleles}{total};
+       }
+       $row{info} = join(';',@extra_info,$row{info});
+    }
+    my $ga_study_information = ga_study_information(dbh => $dbh,
+                                                   sth => $sth_ga_info,
+                                                   id  => $row{id},
+                                                  );
+    if (defined $ga_study_information) {
+       # strip out existing information in $row{info}
+       $row{info} =~ s{(?:GASTUDY|GASTUDYSIG)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+       my @extra_info;
+       push @extra_info,"GASTUDY=".$ga_study_information->{studies};
+       push @extra_info,"GASTUDYSIG=".($ga_study_information->{significant}?'TRUE':'FALSE');
+       $row{info} = join(';',@extra_info,$row{info});
+    }
+    print join("\t",@row{@headers},$#row > $#headers?@row[@headers..$#row]:())."\n";
+    if ($options{verbose} > 0 and $. % 100 == 0) {
+       print STDERR "At line $.\n";
+    }
+}
+
+sub ga_study_information {
+    my %param = @_;
+
+    return undef unless $param{id} =~ /^\d+$/;
+    my $rv = $param{sth}->execute($param{id}) or
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my $results = $param{sth}->fetchall_arrayref({});
+    if (not @{$results}) {
+       return undef;
+    }
+    my $any_significant=0;
+    my @values;
+    for my $result (sort {
+       if (looks_like_number($a->{pvalue})) {
+           if (looks_like_number($b->{pvalue})) {
+               $a->{pvalue} <=> $b->{pvalue};
+           } else {
+               -1;
+           }
+       } elsif (looks_like_number($b->{pvalue})) {
+           1;
+       } else {
+           0;
+       }} @{$results}) {
+       if ($result->{significant}) {
+           $any_significant=1;
+       }
+       my $value = $result->{study_name}.':'.$result->{subpart_name};
+       for my $t (qw(pvalue qvalue fdr)) {
+           next unless exists $result->{$t} and defined $result->{$t} and length $result->{$t};
+           my ($fl) = $t =~ m/^(.)/;
+           $value .= ":$fl:".sprintf('%.3g',$result->{$t});
+       }
+       push @values, $value;
+    }
+    return {significant => $any_significant,
+           studies     => join(',',@values),
+          };
+
+}
+
+sub mrna_information{
+    my %param = @_;
+
+    my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+       print STDERR $param{sth}->errstr;
+       return {};
+    }
+    my $chosen_result;
+    for my $result (@{$results}) {
+       if (not defined $chosen_result) {
+           $chosen_result = $result;
+           next;
+       }
+       if (not defined $chosen_result->{variant} or
+           (defined $result->{variant} and
+            $chosen_result->{variant} > $result->{variant})) {
+           $chosen_result = $result;
+           next;
+       }
+    }
+    if (defined $chosen_result and $chosen_result->{exon}) {
+       # try to figure out the cds if there is mrna information and
+       # we're in an exon
+       my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or
+           die "Unable to execute statement properly: ".$param{dbh}->errstr;
+       my $cds_results = $param{cds_sth}->fetchall_arrayref({});
+       if ($param{cds_sth}->err) {
+           print STDERR $param{cds_sth}->errstr;
+       }
+       else {
+           print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG;
+           my $chosen_cds;
+           for my $cds_result (@{$cds_results}) {
+               if (not defined $chosen_cds) {
+                   $chosen_cds = $cds_result;
+                   next;
+               }
+               if (not defined $cds_result->{variant} or
+                   (defined $chosen_cds->{variant} and
+                    $chosen_cds->{variant} ge $cds_result->{variant})) {
+                   $chosen_cds = $cds_result;
+                   next;
+               }
+           }
+           if (defined $chosen_cds and keys %{$chosen_cds}) {
+               my $comp_offset = $chosen_cds->{complement}?-1:1;
+               $chosen_result->{in_cds} = 1;
+               $chosen_result->{cds_pos} =
+                   ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) +
+                       $chosen_cds->{position};
+               $chosen_result->{gene} = $chosen_cds->{gene};
+               $chosen_result->{pos} = $chosen_cds->{pos};
+               $chosen_result->{number} = $chosen_cds->{number};
+               $chosen_result->{variant} = $chosen_cds->{variant};
+               $chosen_result->{acc} = $chosen_cds->{accession};
+               $chosen_result->{accver} = $chosen_cds->{accession_ver};
+               $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3);
+               $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3;
+               print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG;
+               my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or
+                   die "Unable to execute statement properly: ".$param{dbh}->errstr;
+               my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref();
+               if (defined $mrna_cds_result and keys %{$mrna_cds_result}) {
+                   print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param],
+                                                   [qw(mrna_cds_result chosen_result *param)]) if $DEBUG;
+                   my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3)  ,3;
+                   my $orig_aa = convert_to_prot_seq($codon);
+                   my $modified_codon = $codon;
+                   substr($modified_codon,$chosen_result->{cds_pos} % 3,1)=
+                       ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt});
+                   my $new_aa = convert_to_prot_seq($modified_codon);
+                   print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3),
+                                                                                    codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG;
+                   if ($DEBUG > 1) {
+                       my $a = -1;
+                       print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g;
+                   }
+                   $chosen_result->{orig_aa} = $orig_aa;
+                   $chosen_result->{new_aa} = $new_aa;
+                   $chosen_result->{new_codon} = $modified_codon;
+                   $chosen_result->{orig_codon} = $codon;
+                   $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0;
+               }
+           }
+       }
+    }
+    return $chosen_result // {};
+}
+
+sub find_real_snp_id {
+    my %param = @_;
+
+    my %info;
+    my $rv = $param{sth}->execute($param{id},$param{id}) //
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]);
+    if ($param{sth}->err) {
+       print STDERR $param{sth}->errstr;
+       $param{sth}->finish;
+       return $param{id};
+    }
+    $param{sth}->finish;
+    return $real_id // $param{id};
+}
+
+sub is_retired_snp {
+    my %param = @_;
+
+    my %info;
+    my $rv = $param{sth}->execute($param{id}) //
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    if ($param{sth}->rows >= 1) {
+       return 1;
+    }
+    return 0;
+}
+
+
+sub fixup_chr {
+    my ($chr) = @_;
+
+    $chr =~ s/^NC_00000?//;
+    $chr =~ s/^chr//;
+    if (looks_like_number($chr)) {
+       if ($chr == 23) {
+           $chr = "X"
+       }
+       elsif ($chr == 24) {
+           $chr = "Y"
+       }
+    }
+    return ($chr);
+}
+
+sub allele_freq_information{
+    my %param = @_;
+
+    return {} if $param{id} !~ /^\d+$/;
+
+    my $rv = $param{sth}->execute($param{id},$param{chr}) or
+       die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr};
+
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+       print STDERR $param{sth}->errstr;
+       return {};
+    }
+    my %pop_alleles;
+    my %alleles;
+    my %seen_indiv;
+    my %indivs;
+    for my $result (@{$results}) {
+       next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}};
+       $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle};
+       $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id};
+       $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele};
+       if (defined $result->{loc_pop_id}) {
+           $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++;
+           $pop_alleles{$result->{loc_pop_id}}{total}++;
+       }
+       $alleles{$result->{allele}}++;
+       $alleles{total}++;
+       $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1;
+    }
+    my %genotypes;
+    my %pop_genotypes;
+    for my $indiv (values %indivs) {
+       if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) {
+           my $genotype = join('/',
+                               sort($indiv->{chr_1},
+                                    $indiv->{chr_2}));
+           $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++;
+           $pop_genotypes{$indiv->{loc_pop_id}}{total}++;
+           $genotypes{$genotype}++;
+           $genotypes{total}++;
+       }
+    }
+    return {pop_alleles   => \%pop_alleles,
+           pop_genotypes => \%pop_genotypes,
+           genotypes     => \%genotypes,
+           alleles       => \%alleles,
+          };
+}
+
+sub allele_freq{
+    my ($freq_table,$allele) = @_;
+    if (defined $allele) {
+       return ((exists $freq_table->{$allele} and defined $freq_table->{$allele}) ?
+               $freq_table->{$allele}/$freq_table->{total} : 0);
+    }
+    my @alleles = grep {$_ ne 'total'} keys %{$freq_table};
+    return join(',',map {$_.':'.(defined $freq_table->{$_} ?
+                                $freq_table->{$_}/$freq_table->{total} : 0)} @alleles);
+}
+
+
+sub find_rsid {
+    my %param = @_;
+
+    my $rv = $param{sth}->execute($param{chr},$param{pos}) //
+       die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my @snp_ids = grep {$_ =~ /^\d+$/}
+       map {ref $_ eq 'ARRAY'? @{$_}:$_}
+           map {ref $_ eq 'ARRAY'? @{$_}:$_}
+               $sth->fetchall_arrayref([0]);
+    if ($sth->err) {
+       print STDERR $sth->errstr;
+       return '?';
+    }
+    if (not @snp_ids) {
+       return '.';
+    }
+    return join(',',@snp_ids);
+}
+
+sub dna_complement{
+    my ($seq) = @_;
+    return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq);
+}
+
+sub convert_to_prot_seq{
+    my ($mrna) = @_;
+    $mrna =~ s/U/T/g;
+    my $prot = '';
+    for my $pos (0..(length($mrna) / 3 - 1)) {
+       my $codon = substr($mrna,$pos*3,3);
+       if (not exists $prot_table->{$codon}) {
+           # this isn't quite right, as we should really be handling
+           # ambiguities which do not result in different amino
+           # acids, but for the time being, do it this way.
+           $prot .= 'N';
+       }
+       else {
+           $prot .= $prot_table->{$codon};
+       }
+    }
+    return $prot;
+}
+
+
+__DATA__
+ATT    I       Ile     Isoleucine
+ATC    I       Ile     Isoleucine
+ATA    I       Ile     Isoleucine
+CTT    L       Leu     Leucine
+CTC    L       Leu     Leucine
+CTA    L       Leu     Leucine
+CTG    L       Leu     Leucine
+TTA    L       Leu     Leucine
+TTG    L       Leu     Leucine
+GTT    V       Val     Valine
+GTC    V       Val     Valine
+GTA    V       Val     Valine
+GTG    V       Val     Valine
+TTT    F       Phe     Phenylalanine
+TTC    F       Phe     Phenylalanine
+ATG    M       Met     Methionine
+TGT    C       Cys     Cysteine
+TGC    C       Cys     Cysteine
+GCT    A       Ala     Alanine
+GCC    A       Ala     Alanine
+GCA    A       Ala     Alanine
+GCG    A       Ala     Alanine
+GGT    G       Gly     Glycine
+GGC    G       Gly     Glycine
+GGA    G       Gly     Glycine
+GGG    G       Gly     Glycine
+CCT    P       Pro     Proline
+CCC    P       Pro     Proline
+CCA    P       Pro     Proline
+CCG    P       Pro     Proline
+ACT    T       Thr     Threonine
+ACC    T       Thr     Threonine
+ACA    T       Thr     Threonine
+ACG    T       Thr     Threonine
+TCT    S       Ser     Serine
+TCC    S       Ser     Serine
+TCA    S       Ser     Serine
+TCG    S       Ser     Serine
+AGT    S       Ser     Serine
+AGC    S       Ser     Serine
+TAT    Y       Tyr     Tyrosine
+TAC    Y       Tyr     Tyrosine
+TGG    W       Trp     Tryptophan
+CAA    Q       Gln     Glutamine
+CAG    Q       Gln     Glutamine
+AAT    N       Asn     Asparagine
+AAC    N       Asn     Asparagine
+CAT    H       His     Histidine
+CAC    H       His     Histidine
+GAA    E       Glu     Glutamic acid
+GAG    E       Glu     Glutamic acid
+GAT    D       Asp     Aspartic acid
+GAC    D       Asp     Aspartic acid
+AAA    K       Lys     Lysine
+AAG    K       Lys     Lysine
+CGT    R       Arg     Arginine
+CGC    R       Arg     Arginine
+CGA    R       Arg     Arginine
+CGG    R       Arg     Arginine
+AGA    R       Arg     Arginine
+AGG    R       Arg     Arginine
+TAA    *       *       Stop codon
+TAG    *       *       Stop codon
+TGA    *       *       Stop codon