From 8d6648d9f933285fbcf635ae54225aa9dd01ffed Mon Sep 17 00:00:00 2001
From: Don Armstrong <don@donarmstrong.com>
Date: Wed, 21 Mar 2012 15:50:20 -0700
Subject: [PATCH] add base versions of routines

---
 bin/snp_info         | 706 +++++++++++++++++++++++++++++++++++++++++++
 bin/snp_study_table  | 200 ++++++++++++
 bin/vcf_add_snp_info | 611 +++++++++++++++++++++++++++++++++++++
 3 files changed, 1517 insertions(+)
 create mode 100755 bin/snp_info
 create mode 100755 bin/snp_study_table
 create mode 100755 bin/vcf_add_snp_info

diff --git a/bin/snp_info b/bin/snp_info
new file mode 100755
index 0000000..545d2f3
--- /dev/null
+++ b/bin/snp_info
@@ -0,0 +1,706 @@
+#!/usr/bin/perl
+# snp_info outputs additional snp information is released under the
+# terms of the GPL version 2, or any later version, at your option.
+# See the file README and COPYING for more information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+BEGIN {$|=1};
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+snp_info - output snp information
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--nearest-gene>
+
+Return the name of the nearest gene if the SNP isn't in a gene
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+echo 'rs1782034' | snp_info;
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Data::Dumper;
+use Scalar::Util qw(looks_like_number);
+
+
+my %options = (debug           => 0,
+	       help            => 0,
+	       man             => 0,
+	       verbose         => 0,
+	       fix_pos         => 1,
+	       nearest_gene    => 0,
+	       build           => '135',
+	       refseq          => '37_3',
+	       );
+
+GetOptions(\%options,
+	   'verbose|v+',
+	   'fix_pos|fix-pos!',
+	   'nearest_gene|nearest-gene!',
+	   'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+# if (1) {
+#      push @USAGE_ERRORS,"You must pass something";
+# }
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database: ".$DBI::errstr;
+
+my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr;
+SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=?
+END
+
+my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr;
+SELECT * FROM snphistory WHERE snp_id=?;
+END
+
+my $snp_chr_pos_sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT scpr.snp_id AS snp_id,
+       scpr.chr AS chr,
+       scpr.pos AS pos,
+       scl.phys_pos_from AS pos2,
+       scpr.orien AS orien,
+       scl.allele AS ref,
+       uv.var_str AS var_str,
+       ruv.var_str AS rev_var_str,
+       ml.locus_id AS uid,
+       ml.locus_symbol AS symbol,
+       gitn.gene_name AS description,
+       scl.aln_quality AS alignment_quality
+       FROM snp s
+         JOIN b$options{build}_snpchrposonref_$options{refseq} scpr ON s.snp_id=scpr.snp_id
+         JOIN b$options{build}_snpcontigloc_$options{refseq} scl ON scpr.snp_id=scl.snp_id
+         JOIN b$options{build}_contiginfo_$options{refseq} ci ON scl.ctg_id = ci.ctg_id
+         LEFT OUTER JOIN b$options{build}_snpcontiglocusid_$options{refseq} ml ON s.snp_id=ml.snp_id
+         LEFT OUTER JOIN geneidtoname gitn ON ml.locus_id=gitn.gene_id
+         JOIN univariation uv ON s.univar_id=uv.univar_id
+         JOIN univariation ruv ON uv.rev_univar_id=ruv.univar_id
+WHERE (ci.group_term LIKE 'GRCh%' OR ci.group_term='Primary_Assembly') AND s.snp_id=? ORDER BY scl.aln_quality;
+END
+
+my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?;
+END
+
+my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr;
+SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr;
+SELECT * FROM mrna_cds_table WHERE cds_gi=?;
+END
+
+my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr;
+SELECT ssl.snp_id AS snp_id,
+       si.subsnp_id AS subsnp_id,
+       si.chr AS chr,
+       si.submitted_ind_id AS submitted_ind_id,
+       ga.chr_num AS chr_num,
+       ga.rev_flag AS rev_flag,
+       a.allele AS allele
+  FROM snpsubsnplink ssl
+    JOIN subind si on ssl.subsnp_id=si.subsnp_id
+    JOIN gtyallele ga on si.gty_id=ga.gty_id
+    JOIN allele a on ga.fwd_allele_id = a.allele_id
+    WHERE ssl.snp_id=? AND
+          si.chr=? AND ga.rev_flag = '0' AND
+          a.allele <> 'N';
+END
+
+my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_near_gene_start = $dbh->prepare(<<'END') // die "Unable to prepare near gene start statement: ".$dbh->errstr;
+SELECT cds.start AS start,
+       cds.stop AS stop,
+       cds.gene AS symbol,
+       gitn.gene_name AS description
+    FROM cds
+      LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id
+    WHERE cds.chr = ? AND cds.start > ?
+    ORDER BY cds.start ASC LIMIT 1;
+END
+
+my $sth_near_gene_stop = $dbh->prepare(<<'END') // die "Unable to prepare near gene stop statement: ".$dbh->errstr;
+SELECT cds.start AS start,
+       cds.stop AS stop,
+       cds.gene AS symbol,
+       gitn.gene_name AS description
+    FROM cds
+      LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id
+    WHERE cds.chr = ? AND cds.stop < ?
+    ORDER BY cds.stop DESC LIMIT 1;
+END
+
+my $prot_table;
+
+while (<DATA>) {
+    chomp;
+    my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+    $prot_table->{$codon} = $aa;
+}
+
+my @headers = qw(id chr pos ref alt);
+my @row_keys_to_initialize = qw(gene nonsyn aa_ref aa_alt codon_ref codon_alt cdspos protpos loc num var acc phase reffreq altfreq refaltn);
+print join("\t",@headers,'orig_id',@row_keys_to_initialize)."\n";
+while (<>) {
+    print $_ and next if /^#/;
+    chomp;
+    my @row = split /\t/, $_;
+    my %row;
+    @row{@headers} = @row[0..$#headers];
+    $row{orig_id} = $row{id};
+    # this is taken directly from vcf_fill_rsid; we want it for the time being anyway
+    if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) {
+	my $new_id = find_rsid(chr => fixup_chr($row{chr}),
+			       # snps are 0 indexed
+			       pos => $row{pos} - 1,
+			       dbh => $dbh,
+			       sth => $sth,
+			      );
+	if ($new_id !~ /^(?:[\.\?])$/) {
+	    $row{id} = "rs".$new_id;
+	}
+    }
+    if ($row{id} =~ /(rs|ss)(\d+)/) {
+	my $type = $1;
+	my $id = $2;
+	$id = find_real_snp_id(id => $id,
+			       dbh => $dbh,
+			       sth => $real_snp_sth,
+			      );
+	if (not defined $id and
+	    is_retired_snp(id  => $id,
+			   dbh => $dbh,
+			   sth => $sth_snp_history,
+			  )) {
+	    if (defined $row{chr} and defined $row{pos}) {
+		$row{id} = "$row{chr}-$row{pos}";
+	    }
+	}
+	else {
+	    $row{id} = "$type$id";
+	}
+    }
+
+    if (defined $row{id} and ($options{fix_pos} or
+			      not defined $row{pos} or
+			      not defined $row{chr} or
+			      not defined $row{ref} or
+			      not defined $row{alt})) {
+	my %info = find_chr_pos(id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0],
+				dbh => $dbh,
+				sth => $snp_chr_pos_sth,
+			       );
+	for my $inf (qw(pos chr ref alt)) {
+	    $row{$inf} = $info{$inf} if $options{fix_pos} and defined $info{$inf} or not defined $row{$inf};
+	}
+	# handle partially aligned snps
+	if ($options{fix_pos} and not defined $row{pos}) {
+	    $row{pos} = $info{pos2};
+	}
+	if (not defined $row{pos} or not defined $row{chr}) {
+	    if (defined $row{pos}) {
+		print STDERR "$row{id} is ambiguously placed\n";
+	    }
+	    else {
+		print STDERR "Unable to find a position/chr for $row{id}\n";
+	    }
+	}
+    }
+    my $mrna_information = mrna_information(dbh => $dbh,
+					    sth => $sth_mrna_ss,
+					    # these are all 1 indexed
+					    pos => $row{pos},
+					    chr => fixup_chr($row{chr}),
+					    ref => $row{ref},
+					    alt => $row{alt},
+					    cds_sth => $sth_cds_ss,
+					    mrna_cds_sth => $sth_mrna_cds,
+					   );
+    @row{@row_keys_to_initialize} = ('') x @row_keys_to_initialize;
+    if (keys %{$mrna_information}) {
+	# strip out existing information in $row{info}
+	my @extra_info;
+	$row{gene}     = $mrna_information->{gene};
+	$row{nonsyn}   = 1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn};
+	$row{aa_ref}    = $mrna_information->{orig_aa} if defined $mrna_information->{orig_aa};
+	$row{aa_alt}    = $mrna_information->{new_aa} if defined $mrna_information->{new_aa};
+	$row{codon_ref} = $mrna_information->{orig_codon} if defined $mrna_information->{orig_codon};
+	$row{codon_alt} = $mrna_information->{new_codon} if defined $mrna_information->{new_codon};
+	$row{cdspos}   = $mrna_information->{cds_pos} if defined $mrna_information->{cds_pos};
+	$row{protpos}  = $mrna_information->{prot_pos} if defined $mrna_information->{prot_pos};
+	$row{loc}      = $mrna_information->{exon}?'exon':'intron' if defined $mrna_information->{gene};
+	$row{num}      = $mrna_information->{number};
+	$row{var}      = $mrna_information->{variant} if defined $mrna_information->{variant};
+	$row{acc}      = $mrna_information->{accession};
+	$row{phase}    = $mrna_information->{phase} if defined $mrna_information->{phase};
+    }
+    if ((not defined $row{gene} or not length $row{gene})
+	and $options{nearest_gene}) {
+	$row{gene} = find_nearest_gene(dbh => $dbh,
+				       start_sth => $sth_near_gene_start,
+				       stop_sth => $sth_near_gene_stop,
+				       chr => fixup_chr($row{chr}),
+				       pos => $row{pos},
+				      );
+    }
+    my $allele_freq_information = allele_freq_information(dbh => $dbh,
+							  sth => $sth_allele_freq,
+							  id  => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0],
+							  # these are all 1 indexed
+							  pos => $row{pos},
+							  chr => fixup_chr($row{chr}),
+							  ref => $row{ref},
+							  alt => $row{alt},
+							 ) if defined $row{id} and $row{id} =~ /^(?:rs|ss)/;
+    if (keys %{$allele_freq_information}) {
+	# strip out existing information in $row{info}
+	if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) {
+	    $row{reffreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref}));
+	    $row{altfreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt}));
+	    $row{refaltn} = $allele_freq_information->{alleles}{total};
+	}
+    }
+    print join("\t",map {defined $_?$_:'NA'} @row{@headers,'orig_id',@row_keys_to_initialize},$#row > $#headers?@row[@headers..$#row]:())."\n";
+    if ($options{verbose} > 0 and $. % 100 == 0) {
+	print STDERR "At line $.\n";
+    }
+}
+
+sub find_real_snp_id {
+    my %param = @_;
+
+    my %info;
+    my $rv = $param{sth}->execute($param{id},$param{id}) //
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]);
+    if ($param{sth}->err) {
+	print STDERR $param{sth}->errstr;
+	$param{sth}->finish;
+	return $param{id};
+    }
+    $param{sth}->finish;
+    return $real_id // $param{id};
+}
+
+sub find_chr_pos {
+    my %param = @_;
+
+    my %info;
+    my $rv = $param{sth}->execute($param{id}) //
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+	print STDERR $param{sth}->errstr;
+	$param{sth}->finish;
+	return %info;
+    }
+    for my $result (@{$results}) {
+	%info = %{$result};
+    }
+    if (keys %info) {
+	if ($info{chr} eq 'NotOn') {
+	    undef $info{chr};
+	}
+	if ($info{orien}) {
+	    $info{orien_var_str} = $info{rev_var_str};
+	}
+	else {
+	    $info{orien_var_str} = $info{var_str};
+	}
+	my @alleles = split qr{/},$info{orien_var_str};
+	($info{alt}) = grep {$_ ne $info{ref}} @alleles;
+	# fix $row{pos}
+	$info{pos}+=1;
+    }
+    $param{sth}->finish;
+    return %info;
+}
+
+sub find_nearest_gene {
+    my %param = @_;
+    return undef unless defined $param{chr} and defined $param{pos};
+
+    my %possible_genes;
+
+    for my $st (qw(start stop)) {
+	my $rv = $param{"${st}_sth"}->execute($param{chr},$param{pos}) //
+	    die "Unable to execute nearest gene $st statement properly: ".$param{dbh}->errstr;
+	my $results = $param{"${st}_sth"}->fetchall_arrayref({});
+	if ($param{"${st}_sth"}->err) {
+	    print STDERR $param{"${st}_sth"}->errstr;
+	    $param{"${st}_sth"}->finish;
+	    return '';
+	}
+	$possible_genes{$st} = $results->[0]
+    }
+    for my $st (qw(start stop)) {
+	# opposite of start/stop (stop/start)
+	my $st_op = $st eq q(start) ? q(stop) : q(start);
+	if (not defined $possible_genes{$st}{start}) {
+	    if (defined $possible_genes{$st_op}{start}) {
+		return $possible_genes{$st_op}{symbol}
+	    }
+	    else {
+		return undef;
+	    }
+	}
+    }
+    my $gene;
+    # if the start of the start gene is closer to the position of the
+    # snp than the stop of the stop gene, it's the nearest gene;
+    # otherwise, the stop gene is
+    if ($possible_genes{start}{start} - $param{pos} <
+	$param{pos} - $possible_genes{stop}{stop}
+       ) {
+	return $possible_genes{start}{symbol};
+    }
+    else {
+	return $possible_genes{stop}{symbol};
+    }
+}
+
+
+sub mrna_information{
+    my %param = @_;
+
+    my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+	print STDERR $param{sth}->errstr;
+	return {};
+    }
+    my $chosen_result;
+    for my $result (@{$results}) {
+	if (not defined $chosen_result) {
+	    $chosen_result = $result;
+	    next;
+	}
+	if (not defined $chosen_result->{variant} or
+	    (defined $result->{variant} and
+	     $chosen_result->{variant} > $result->{variant})) {
+	    $chosen_result = $result;
+	    next;
+	}
+    }
+    if (defined $chosen_result and $chosen_result->{exon}) {
+	# try to figure out the cds if there is mrna information and
+	# we're in an exon
+	my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or
+	    die "Unable to execute statement properly: ".$param{dbh}->errstr;
+	my $cds_results = $param{cds_sth}->fetchall_arrayref({});
+	if ($param{cds_sth}->err) {
+	    print STDERR $param{cds_sth}->errstr;
+	}
+	else {
+	    print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG;
+	    my $chosen_cds;
+	    for my $cds_result (@{$cds_results}) {
+		if (not defined $chosen_cds) {
+		    $chosen_cds = $cds_result;
+		    next;
+		}
+		if (not defined $cds_result->{variant} or
+		    (defined $chosen_cds->{variant} and
+		     $chosen_cds->{variant} ge $cds_result->{variant})) {
+		    $chosen_cds = $cds_result;
+		    next;
+		}
+	    }
+	    if (defined $chosen_cds and keys %{$chosen_cds}) {
+		my $comp_offset = $chosen_cds->{complement}?-1:1;
+		$chosen_result->{in_cds} = 1;
+		$chosen_result->{cds_pos} =
+		    ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) +
+			$chosen_cds->{position};
+		$chosen_result->{gene} = $chosen_cds->{gene};
+		$chosen_result->{pos} = $chosen_cds->{pos};
+		$chosen_result->{number} = $chosen_cds->{number};
+		$chosen_result->{variant} = $chosen_cds->{variant};
+		$chosen_result->{acc} = $chosen_cds->{accession};
+		$chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3);
+		$chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3;
+		print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG;
+		my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or
+		    die "Unable to execute statement properly: ".$param{dbh}->errstr;
+		my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref();
+		if (defined $mrna_cds_result and keys %{$mrna_cds_result}) {
+ 		    print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param],
+						    [qw(mrna_cds_result chosen_result *param)]) if $DEBUG;
+		    my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3)  ,3;
+		    my $orig_aa = convert_to_prot_seq($codon);
+		    my $modified_codon = $codon;
+		    substr($modified_codon,$chosen_result->{cds_pos} % 3,1)=
+			($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt});
+		    my $new_aa = convert_to_prot_seq($modified_codon);
+  		    print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3),
+ 										     codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG;
+		    if ($DEBUG > 1) {
+			my $a = -1;
+			print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g;
+		    }
+ 		    $chosen_result->{orig_aa} = $orig_aa;
+		    $chosen_result->{new_aa} = $new_aa;
+		    $chosen_result->{new_codon} = $modified_codon;
+		    $chosen_result->{orig_codon} = $codon;
+		    $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0;
+		}
+	    }
+	}
+    }
+    return $chosen_result // {};
+}
+
+
+sub fixup_chr {
+    my ($chr) = @_;
+    return $chr if not defined $chr;
+
+    $chr =~ s/^NC_00000?//;
+    $chr =~ s/^chr//;
+    if (looks_like_number($chr)) {
+	if ($chr == 23) {
+	    $chr = "X"
+	}
+	elsif ($chr == 24) {
+	    $chr = "Y"
+	}
+    }
+    return ($chr);
+}
+
+sub allele_freq_information{
+    my %param = @_;
+
+    return {} if $param{id} !~ /^\d+$/;
+
+    my $rv = $param{sth}->execute($param{id},$param{chr}) or
+	die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr};
+
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+	print STDERR $param{sth}->errstr;
+	return {};
+    }
+    my %pop_alleles;
+    my %alleles;
+    my %seen_indiv;
+    my %indivs;
+    for my $result (@{$results}) {
+	next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}};
+	$indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele};
+	if (defined $result->{loc_pop_id}) {
+	    $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id};
+	    $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle};
+	    $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++;
+	    $pop_alleles{$result->{loc_pop_id}}{total}++;
+	}
+	$alleles{$result->{allele}}++;
+	$alleles{total}++;
+	$seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1;
+    }
+    my %genotypes;
+    my %pop_genotypes;
+    for my $indiv (values %indivs) {
+	if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) {
+	    my $genotype = join('/',
+				sort($indiv->{chr_1},
+				     $indiv->{chr_2}));
+	    if (defined $indiv->{loc_pop_id}) {
+		$pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++;
+		$pop_genotypes{$indiv->{loc_pop_id}}{total}++;
+	    }
+	    $genotypes{$genotype}++;
+	    $genotypes{total}++;
+	}
+    }
+    return {pop_alleles   => \%pop_alleles,
+	    pop_genotypes => \%pop_genotypes,
+	    genotypes     => \%genotypes,
+	    alleles       => \%alleles,
+	   };
+}
+
+sub allele_freq{
+    my ($freq_table,$allele) = @_;
+    my $flipped_allele = $allele;
+    $flipped_allele =~ tr/ACTG/TGAC/;
+    if (defined $allele) {
+	if (exists $freq_table->{$allele} and defined $freq_table->{$allele}) {
+	    return $freq_table->{$allele}/$freq_table->{total};
+	}
+	elsif (exists $freq_table->{$flipped_allele} and defined $freq_table->{$flipped_allele}) {
+	    return $freq_table->{$flipped_allele}/$freq_table->{total};
+	}
+	return 0;
+    }
+    my @alleles = grep {$_ ne 'total'} keys %{$freq_table};
+    return join(',',map {$_.':'.(defined $freq_table->{$_} ?
+				 $freq_table->{$_}/$freq_table->{total} : 0)} @alleles);
+}
+
+
+sub find_rsid {
+    my %param = @_;
+
+    my $rv = $param{sth}->execute($param{chr},$param{pos}) //
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my @snp_ids = grep {$_ =~ /^\d+$/}
+	map {ref $_ eq 'ARRAY'? @{$_}:$_}
+	    map {ref $_ eq 'ARRAY'? @{$_}:$_}
+		$sth->fetchall_arrayref([0]);
+    if ($sth->err) {
+	print STDERR $sth->errstr;
+	return '?';
+    }
+    if (not @snp_ids) {
+	return '.';
+    }
+    return join(',',@snp_ids);
+}
+
+sub dna_complement{
+    my ($seq) = @_;
+    return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq);
+}
+
+sub convert_to_prot_seq{
+    my ($mrna) = @_;
+    $mrna =~ s/U/T/g;
+    my $prot = '';
+    for my $pos (0..(length($mrna) / 3 - 1)) {
+	my $codon = substr($mrna,$pos*3,3);
+	if (not exists $prot_table->{$codon}) {
+	    # this isn't quite right, as we should really be handling
+	    # ambiguities which do not result in different amino
+	    # acids, but for the time being, do it this way.
+	    $prot .= 'N';
+	}
+	else {
+	    $prot .= $prot_table->{$codon};
+	}
+    }
+    return $prot;
+}
+
+
+__DATA__
+ATT	I	Ile	Isoleucine
+ATC	I	Ile	Isoleucine
+ATA	I	Ile	Isoleucine
+CTT	L	Leu	Leucine
+CTC	L	Leu	Leucine
+CTA	L	Leu	Leucine
+CTG	L	Leu	Leucine
+TTA	L	Leu	Leucine
+TTG	L	Leu	Leucine
+GTT	V	Val	Valine
+GTC	V	Val	Valine
+GTA	V	Val	Valine
+GTG	V	Val	Valine
+TTT	F	Phe	Phenylalanine
+TTC	F	Phe	Phenylalanine
+ATG	M	Met	Methionine
+TGT	C	Cys	Cysteine
+TGC	C	Cys	Cysteine
+GCT	A	Ala	Alanine
+GCC	A	Ala	Alanine
+GCA	A	Ala	Alanine
+GCG	A	Ala	Alanine
+GGT	G	Gly	Glycine
+GGC	G	Gly	Glycine
+GGA	G	Gly	Glycine
+GGG	G	Gly	Glycine
+CCT	P	Pro	Proline
+CCC	P	Pro	Proline
+CCA	P	Pro	Proline
+CCG	P	Pro	Proline
+ACT	T	Thr	Threonine
+ACC	T	Thr	Threonine
+ACA	T	Thr	Threonine
+ACG	T	Thr	Threonine
+TCT	S	Ser	Serine
+TCC	S	Ser	Serine
+TCA	S	Ser	Serine
+TCG	S	Ser	Serine
+AGT	S	Ser	Serine
+AGC	S	Ser	Serine
+TAT	Y	Tyr	Tyrosine
+TAC	Y	Tyr	Tyrosine
+TGG	W	Trp	Tryptophan
+CAA	Q	Gln	Glutamine
+CAG	Q	Gln	Glutamine
+AAT	N	Asn	Asparagine
+AAC	N	Asn	Asparagine
+CAT	H	His	Histidine
+CAC	H	His	Histidine
+GAA	E	Glu	Glutamic acid
+GAG	E	Glu	Glutamic acid
+GAT	D	Asp	Aspartic acid
+GAC	D	Asp	Aspartic acid
+AAA	K	Lys	Lysine
+AAG	K	Lys	Lysine
+CGT	R	Arg	Arginine
+CGC	R	Arg	Arginine
+CGA	R	Arg	Arginine
+CGG	R	Arg	Arginine
+AGA	R	Arg	Arginine
+AGG	R	Arg	Arginine
+TAA	*	*	Stop codon
+TAG	*	*	Stop codon
+TGA	*	*	Stop codon
diff --git a/bin/snp_study_table b/bin/snp_study_table
new file mode 100755
index 0000000..6a8d887
--- /dev/null
+++ b/bin/snp_study_table
@@ -0,0 +1,200 @@
+#! /usr/bin/perl
+# snp_study_table outputs a table of study results, and is released
+# under the terms of the GPL version 2, or any later version, at your
+# option. See the file README and COPYING for more information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+# $Id: perl_script 1825 2011-01-02 01:53:43Z don $
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+snp_study_table - output an xls table of study results
+
+=head1 SYNOPSIS
+
+snp_study_table [options] [snps]
+
+ Options:
+  --output output filename (Default STDOUT)
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 SNPS
+
+SNPS can be specified as rs12345 or as chr-pos; in the later case, the
+ga_chr_pos table will be used, in the former, the ga_snp table will be
+used.
+
+=head1 OPTIONS
+
+=over
+
+=item B<--output>
+
+Output filename (defaults to STDOUT)
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+
+=cut
+
+
+use vars qw($DEBUG);
+
+use DBI;
+use Spreadsheet::WriteExcel;
+
+my %options = (debug           => 0,
+	       help            => 0,
+	       man             => 0,
+	       );
+
+GetOptions(\%options,
+	   'output|o=s',
+	   'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+my $out_fh = \*STDOUT;
+if (exists $options{output}) {
+    $out_fh = IO::File->new($options{output},'w') or
+	die "Unable to open $options{output} for writing: $!";
+}
+my $wb = Spreadsheet::WriteExcel->new($out_fh);
+my $ws = $wb->add_worksheet('snp_results');
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database: ".$DBI::errstr;
+
+my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_snp WHERE snp_id=?;
+END
+
+my $sth_ga_cp_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_chr_pos WHERE chr=? AND pos=?;
+END
+
+
+if (not @ARGV) {
+    while (<STDIN>) {
+	next if /^#/;
+	chomp;
+	push @ARGV,$_;
+    }
+}
+
+my %studies;
+my %snps;
+for my $snp (@ARGV) {
+    my $results;
+    if ($snp =~ /^(?:rs)(\d+)$/) {
+	$results = get_snp_results(dbh => $dbh,
+				   sth => $sth_ga_info,
+				   bind => [$1],
+				  );
+    }
+    elsif ($snp =~ /^(\d+)-(\d+)$/) {
+	$results = get_snp_results(dbh => $dbh,
+				   sth => $sth_ga_cp_info,
+				   bind => [$1,$2],
+				  );
+    }
+    else {
+	print STDERR "Invalid snp format '$snp'\n";
+	next;
+    }
+    $snps{$snp} = $results;
+    for my $result (@{$results}) {
+	$studies{$result->{study_name}}{$result->{subpart_name}} = 1;
+    }
+}
+$dbh->disconnect();
+
+#column names
+my @cn = ('A'..'Z','AA'..'ZZ');
+
+my $r = 1;
+my $c = 0;
+my $f_center = $wb->add_format(align => 'center');
+$ws->write($cn[$c++].($r+2),'SNP');
+for my $study (sort keys %studies) {
+    my $n_substudy = keys %{$studies{$study}};
+    $ws->write($cn[$c].$r,$study);
+    $ws->merge_range($r-1,$c,$r-1,$c+$n_substudy*3-1,$study,$f_center);
+    for my $substudy (sort keys %{$studies{$study}}) {
+	$ws->write($cn[$c].($r+1),$substudy);
+	$ws->merge_range($r,$c,$r,$c+2,$substudy,$f_center);
+	$ws->write($cn[$c].($r+2),'P value');
+	$ws->write($cn[$c+1].($r+2),'Q value');
+	$ws->write($cn[$c+2].($r+2),'FDR');
+	$c+=3;
+    }
+}
+$r+=3;
+$c=0;
+for my $snp (sort keys %snps) {
+    $ws->write($cn[$c++].$r,$snp);
+    my %snp_studies;
+    for my $row (@{$snps{$snp}}) {
+	$snp_studies{$row->{study_name}}{$row->{subpart_name}} = [@{$row}{qw(pvalue qvalue fdr)}];
+    }
+    for my $study (sort keys %studies) {
+	for my $substudy (sort keys %{$studies{$study}}) {
+	    if (not exists $snp_studies{$study} or
+		not exists $snp_studies{$study}{$substudy}
+	       ) {
+		$c+=3;
+	    }
+	    else {
+		for (0..2) {
+		    $ws->write($cn[$c++].$r,defined $snp_studies{$study}{$substudy}[$_]?$snp_studies{$study}{$substudy}[$_]:'');
+		}
+	    }
+	}
+    }
+    $r++;
+    $c=0;
+}
+
+sub get_snp_results{
+    my %param = @_;
+    my $rv = $param{sth}->execute(@{$param{bind}}) or
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+	print STDERR $param{sth}->errstr;
+	return {};
+    }
+    return $results;
+}
+
+
+__END__
diff --git a/bin/vcf_add_snp_info b/bin/vcf_add_snp_info
new file mode 100755
index 0000000..db3dcb6
--- /dev/null
+++ b/bin/vcf_add_snp_info
@@ -0,0 +1,611 @@
+#! /usr/bin/perl
+# vcf_add_snp_info adds additional snp information to vcf files, and
+# is released under the terms of the GPL version 2, or any later
+# version, at your option. See the file README and COPYING for more
+# information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+BEGIN {$|=1};
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+vcf_add_snp_info - Add additional snp information to vcf files
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+vcf_fill_rsid foo.vcf > bar.vcf
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Data::Dumper;
+use Scalar::Util qw(looks_like_number);
+
+
+my %options = (debug           => 0,
+	       help            => 0,
+	       man             => 0,
+	       verbose         => 0,
+	       build           => '135',
+	       refseq          => '37_3',
+	       );
+
+GetOptions(\%options,
+	   'build',
+	   'refseq',
+	   'verbose|v+',
+	   'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+# if (1) {
+#      push @USAGE_ERRORS,"You must pass something";
+# }
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database: ".$DBI::errstr;
+
+my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?;
+END
+
+my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr;
+SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=?
+END
+
+my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr;
+SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr;
+SELECT * FROM mrna_cds_table WHERE cds_gi=?;
+END
+
+my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr;
+SELECT * FROM snphistory WHERE snp_id=?;
+END
+
+
+my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr;
+SELECT ssl.snp_id AS snp_id,
+       si.subsnp_id AS subsnp_id,
+--        ssl.substrand_reversed_flag,
+       si.chr AS chr,
+       si.submitted_ind_id AS submitted_ind_id,
+       ga.chr_num AS chr_num,
+--       si.submitted_strand_code,
+--       si.allele_flag,
+       ga.rev_flag AS rev_flag,
+       a.allele AS allele,
+       ra.allele AS rev_allele,
+       p.handle AS pop_handle,
+       p.loc_pop_id AS loc_pop_id
+  FROM snpsubsnplink ssl
+    JOIN subind si on ssl.subsnp_id=si.subsnp_id
+    JOIN gtyallele ga on si.gty_id=ga.gty_id
+    JOIN allele a on ga.fwd_allele_id = a.allele_id
+    JOIN allele ra on a.rev_allele_id = ra.allele_id
+    JOIN submittedindividual sind on si.submitted_ind_id=sind.submitted_ind_id
+    JOIN population p on sind.pop_id=p.pop_id
+    WHERE ssl.snp_id=? AND
+          si.chr=? AND ga.rev_flag = '0' AND
+          a.allele <> 'N';
+END
+
+my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr;
+SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
+END
+
+my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr;
+SELECT * FROM ga_snp WHERE snp_id=?;
+END
+
+my $prot_table;
+
+while (<DATA>) {
+    chomp;
+    my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+    $prot_table->{$codon} = $aa;
+}
+
+my @headers = qw(chr pos id ref alt qual filter info format);
+while (<>) {
+    print $_ and next if /^#/;
+    chomp;
+    my @row = split /\t/, $_;
+    my %row;
+    @row{@headers} = @row[0..$#headers];
+    # this is taken directly from vcf_fill_rsid; we want it for the time being anyway
+    if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) {
+	$row{id} = find_rsid(chr => fixup_chr($row{chr}),
+			     # snps are 0 indexed
+			     pos => $row{pos} - 1,
+			     dbh => $dbh,
+			     sth => $sth,
+			    );
+    }
+    if ($row{id} =~ /(rs|ss)(\d+)/) {
+	my $type = $1;
+	my $id = $2;
+	$id = find_real_snp_id(id => $id,
+			       dbh => $dbh,
+			       sth => $real_snp_sth,
+			      );
+	if (not defined $id and
+	    is_retired_snp(id  => $id,
+			   dbh => $dbh,
+			   sth => $sth_snp_history,
+			  )) {
+	    if (defined $row{chr} and defined $row{pos}) {
+		$row{id} = undef;
+	    }
+	}
+	else {
+	    $row{id} = "$type$id";
+	}
+    }
+    my $mrna_information = mrna_information(dbh => $dbh,
+					    sth => $sth_mrna_ss,
+					    # these are all 1 indexed
+					    pos => $row{pos},
+					    chr => fixup_chr($row{chr}),
+					    ref => $row{ref},
+					    alt => $row{alt},
+					    cds_sth => $sth_cds_ss,
+					    mrna_cds_sth => $sth_mrna_cds,
+					   );
+    if (keys %{$mrna_information}) {
+	# strip out existing information in $row{info}
+	$row{info} =~ s{(?:GENE|(?:CDS|PROT|)POS|NUM|VAR|ACC|NONSYN|(?:ALT|REF)(?:AA|CODON))\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+	my @extra_info;
+	push @extra_info,"GENE=".$mrna_information->{gene};
+	push @extra_info,"NONSYN=".1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn};
+	push @extra_info,"REFAA=".$mrna_information->{orig_aa} if defined $mrna_information->{orig_aa};
+	push @extra_info,"ALTAA=".$mrna_information->{new_aa} if defined $mrna_information->{new_aa};
+	push @extra_info,"REFCODON=".$mrna_information->{orig_codon} if defined $mrna_information->{orig_codon};
+	push @extra_info,"ALTCODON=".$mrna_information->{new_codon} if defined $mrna_information->{new_codon};
+	push @extra_info,"CDSPOS=".$mrna_information->{cds_pos} if defined $mrna_information->{cds_pos};
+	push @extra_info,"PROTPOS=".$mrna_information->{prot_pos} if defined $mrna_information->{prot_pos};
+	push @extra_info,"POS=".($mrna_information->{exon}?'exon':'intron');
+	push @extra_info,"NUM=".$mrna_information->{number};
+	push @extra_info,"VAR=".$mrna_information->{variant} if defined $mrna_information->{variant};
+	push @extra_info,"ACC=".$mrna_information->{accession};
+	push @extra_info,"ACCVER=".$mrna_information->{accession_ver};
+	push @extra_info,"PHASE=".$mrna_information->{phase} if defined $mrna_information->{phase};
+ 	$row{info} = join(';',@extra_info,$row{info});
+    }
+    my $allele_freq_information = allele_freq_information(dbh => $dbh,
+							  sth => $sth_allele_freq,
+							  id  => $row{id},
+							  # these are all 1 indexed
+							  pos => $row{pos},
+							  chr => fixup_chr($row{chr}),
+							  ref => $row{ref},
+							  alt => $row{alt},
+							 );
+    if (keys %{$allele_freq_information}) {
+	# strip out existing information in $row{info}
+	$row{info} =~ s{(?:REFFREQ|ALTFREQ|REFALTN)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+	my @extra_info;
+	if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) {
+	    push @extra_info,"REFFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref}));
+	    push @extra_info,"ALTFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt}));
+	    push @extra_info,"REFALTN=".$allele_freq_information->{alleles}{total};
+	}
+	$row{info} = join(';',@extra_info,$row{info});
+    }
+    my $ga_study_information = ga_study_information(dbh => $dbh,
+						    sth => $sth_ga_info,
+						    id  => $row{id},
+						   );
+    if (defined $ga_study_information) {
+	# strip out existing information in $row{info}
+	$row{info} =~ s{(?:GASTUDY|GASTUDYSIG)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g;
+	my @extra_info;
+	push @extra_info,"GASTUDY=".$ga_study_information->{studies};
+	push @extra_info,"GASTUDYSIG=".($ga_study_information->{significant}?'TRUE':'FALSE');
+	$row{info} = join(';',@extra_info,$row{info});
+    }
+    print join("\t",@row{@headers},$#row > $#headers?@row[@headers..$#row]:())."\n";
+    if ($options{verbose} > 0 and $. % 100 == 0) {
+	print STDERR "At line $.\n";
+    }
+}
+
+sub ga_study_information {
+    my %param = @_;
+
+    return undef unless $param{id} =~ /^\d+$/;
+    my $rv = $param{sth}->execute($param{id}) or
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my $results = $param{sth}->fetchall_arrayref({});
+    if (not @{$results}) {
+	return undef;
+    }
+    my $any_significant=0;
+    my @values;
+    for my $result (sort {
+	if (looks_like_number($a->{pvalue})) {
+	    if (looks_like_number($b->{pvalue})) {
+		$a->{pvalue} <=> $b->{pvalue};
+	    } else {
+		-1;
+	    }
+	} elsif (looks_like_number($b->{pvalue})) {
+	    1;
+	} else {
+	    0;
+	}} @{$results}) {
+	if ($result->{significant}) {
+	    $any_significant=1;
+	}
+	my $value = $result->{study_name}.':'.$result->{subpart_name};
+	for my $t (qw(pvalue qvalue fdr)) {
+	    next unless exists $result->{$t} and defined $result->{$t} and length $result->{$t};
+	    my ($fl) = $t =~ m/^(.)/;
+	    $value .= ":$fl:".sprintf('%.3g',$result->{$t});
+	}
+	push @values, $value;
+    }
+    return {significant => $any_significant,
+	    studies     => join(',',@values),
+	   };
+
+}
+
+sub mrna_information{
+    my %param = @_;
+
+    my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+	print STDERR $param{sth}->errstr;
+	return {};
+    }
+    my $chosen_result;
+    for my $result (@{$results}) {
+	if (not defined $chosen_result) {
+	    $chosen_result = $result;
+	    next;
+	}
+	if (not defined $chosen_result->{variant} or
+	    (defined $result->{variant} and
+	     $chosen_result->{variant} > $result->{variant})) {
+	    $chosen_result = $result;
+	    next;
+	}
+    }
+    if (defined $chosen_result and $chosen_result->{exon}) {
+	# try to figure out the cds if there is mrna information and
+	# we're in an exon
+	my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or
+	    die "Unable to execute statement properly: ".$param{dbh}->errstr;
+	my $cds_results = $param{cds_sth}->fetchall_arrayref({});
+	if ($param{cds_sth}->err) {
+	    print STDERR $param{cds_sth}->errstr;
+	}
+	else {
+	    print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG;
+	    my $chosen_cds;
+	    for my $cds_result (@{$cds_results}) {
+		if (not defined $chosen_cds) {
+		    $chosen_cds = $cds_result;
+		    next;
+		}
+		if (not defined $cds_result->{variant} or
+		    (defined $chosen_cds->{variant} and
+		     $chosen_cds->{variant} ge $cds_result->{variant})) {
+		    $chosen_cds = $cds_result;
+		    next;
+		}
+	    }
+	    if (defined $chosen_cds and keys %{$chosen_cds}) {
+		my $comp_offset = $chosen_cds->{complement}?-1:1;
+		$chosen_result->{in_cds} = 1;
+		$chosen_result->{cds_pos} =
+		    ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) +
+			$chosen_cds->{position};
+		$chosen_result->{gene} = $chosen_cds->{gene};
+		$chosen_result->{pos} = $chosen_cds->{pos};
+		$chosen_result->{number} = $chosen_cds->{number};
+		$chosen_result->{variant} = $chosen_cds->{variant};
+		$chosen_result->{acc} = $chosen_cds->{accession};
+		$chosen_result->{accver} = $chosen_cds->{accession_ver};
+		$chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3);
+		$chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3;
+		print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG;
+		my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or
+		    die "Unable to execute statement properly: ".$param{dbh}->errstr;
+		my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref();
+		if (defined $mrna_cds_result and keys %{$mrna_cds_result}) {
+ 		    print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param],
+						    [qw(mrna_cds_result chosen_result *param)]) if $DEBUG;
+		    my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3)  ,3;
+		    my $orig_aa = convert_to_prot_seq($codon);
+		    my $modified_codon = $codon;
+		    substr($modified_codon,$chosen_result->{cds_pos} % 3,1)=
+			($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt});
+		    my $new_aa = convert_to_prot_seq($modified_codon);
+  		    print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3),
+ 										     codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG;
+		    if ($DEBUG > 1) {
+			my $a = -1;
+			print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g;
+		    }
+ 		    $chosen_result->{orig_aa} = $orig_aa;
+		    $chosen_result->{new_aa} = $new_aa;
+		    $chosen_result->{new_codon} = $modified_codon;
+		    $chosen_result->{orig_codon} = $codon;
+		    $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0;
+		}
+	    }
+	}
+    }
+    return $chosen_result // {};
+}
+
+sub find_real_snp_id {
+    my %param = @_;
+
+    my %info;
+    my $rv = $param{sth}->execute($param{id},$param{id}) //
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]);
+    if ($param{sth}->err) {
+	print STDERR $param{sth}->errstr;
+	$param{sth}->finish;
+	return $param{id};
+    }
+    $param{sth}->finish;
+    return $real_id // $param{id};
+}
+
+sub is_retired_snp {
+    my %param = @_;
+
+    my %info;
+    my $rv = $param{sth}->execute($param{id}) //
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    if ($param{sth}->rows >= 1) {
+	return 1;
+    }
+    return 0;
+}
+
+
+sub fixup_chr {
+    my ($chr) = @_;
+
+    $chr =~ s/^NC_00000?//;
+    $chr =~ s/^chr//;
+    if (looks_like_number($chr)) {
+	if ($chr == 23) {
+	    $chr = "X"
+	}
+	elsif ($chr == 24) {
+	    $chr = "Y"
+	}
+    }
+    return ($chr);
+}
+
+sub allele_freq_information{
+    my %param = @_;
+
+    return {} if $param{id} !~ /^\d+$/;
+
+    my $rv = $param{sth}->execute($param{id},$param{chr}) or
+	die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr};
+
+    my $results = $param{sth}->fetchall_arrayref({});
+    if ($param{sth}->err) {
+	print STDERR $param{sth}->errstr;
+	return {};
+    }
+    my %pop_alleles;
+    my %alleles;
+    my %seen_indiv;
+    my %indivs;
+    for my $result (@{$results}) {
+	next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}};
+	$indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle};
+	$indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id};
+	$indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele};
+	if (defined $result->{loc_pop_id}) {
+	    $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++;
+	    $pop_alleles{$result->{loc_pop_id}}{total}++;
+	}
+	$alleles{$result->{allele}}++;
+	$alleles{total}++;
+	$seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1;
+    }
+    my %genotypes;
+    my %pop_genotypes;
+    for my $indiv (values %indivs) {
+	if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) {
+	    my $genotype = join('/',
+				sort($indiv->{chr_1},
+				     $indiv->{chr_2}));
+	    $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++;
+	    $pop_genotypes{$indiv->{loc_pop_id}}{total}++;
+	    $genotypes{$genotype}++;
+	    $genotypes{total}++;
+	}
+    }
+    return {pop_alleles   => \%pop_alleles,
+	    pop_genotypes => \%pop_genotypes,
+	    genotypes     => \%genotypes,
+	    alleles       => \%alleles,
+	   };
+}
+
+sub allele_freq{
+    my ($freq_table,$allele) = @_;
+    if (defined $allele) {
+	return ((exists $freq_table->{$allele} and defined $freq_table->{$allele}) ?
+		$freq_table->{$allele}/$freq_table->{total} : 0);
+    }
+    my @alleles = grep {$_ ne 'total'} keys %{$freq_table};
+    return join(',',map {$_.':'.(defined $freq_table->{$_} ?
+				 $freq_table->{$_}/$freq_table->{total} : 0)} @alleles);
+}
+
+
+sub find_rsid {
+    my %param = @_;
+
+    my $rv = $param{sth}->execute($param{chr},$param{pos}) //
+	die "Unable to execute statement properly: ".$param{dbh}->errstr;
+    my @snp_ids = grep {$_ =~ /^\d+$/}
+	map {ref $_ eq 'ARRAY'? @{$_}:$_}
+	    map {ref $_ eq 'ARRAY'? @{$_}:$_}
+		$sth->fetchall_arrayref([0]);
+    if ($sth->err) {
+	print STDERR $sth->errstr;
+	return '?';
+    }
+    if (not @snp_ids) {
+	return '.';
+    }
+    return join(',',@snp_ids);
+}
+
+sub dna_complement{
+    my ($seq) = @_;
+    return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq);
+}
+
+sub convert_to_prot_seq{
+    my ($mrna) = @_;
+    $mrna =~ s/U/T/g;
+    my $prot = '';
+    for my $pos (0..(length($mrna) / 3 - 1)) {
+	my $codon = substr($mrna,$pos*3,3);
+	if (not exists $prot_table->{$codon}) {
+	    # this isn't quite right, as we should really be handling
+	    # ambiguities which do not result in different amino
+	    # acids, but for the time being, do it this way.
+	    $prot .= 'N';
+	}
+	else {
+	    $prot .= $prot_table->{$codon};
+	}
+    }
+    return $prot;
+}
+
+
+__DATA__
+ATT	I	Ile	Isoleucine
+ATC	I	Ile	Isoleucine
+ATA	I	Ile	Isoleucine
+CTT	L	Leu	Leucine
+CTC	L	Leu	Leucine
+CTA	L	Leu	Leucine
+CTG	L	Leu	Leucine
+TTA	L	Leu	Leucine
+TTG	L	Leu	Leucine
+GTT	V	Val	Valine
+GTC	V	Val	Valine
+GTA	V	Val	Valine
+GTG	V	Val	Valine
+TTT	F	Phe	Phenylalanine
+TTC	F	Phe	Phenylalanine
+ATG	M	Met	Methionine
+TGT	C	Cys	Cysteine
+TGC	C	Cys	Cysteine
+GCT	A	Ala	Alanine
+GCC	A	Ala	Alanine
+GCA	A	Ala	Alanine
+GCG	A	Ala	Alanine
+GGT	G	Gly	Glycine
+GGC	G	Gly	Glycine
+GGA	G	Gly	Glycine
+GGG	G	Gly	Glycine
+CCT	P	Pro	Proline
+CCC	P	Pro	Proline
+CCA	P	Pro	Proline
+CCG	P	Pro	Proline
+ACT	T	Thr	Threonine
+ACC	T	Thr	Threonine
+ACA	T	Thr	Threonine
+ACG	T	Thr	Threonine
+TCT	S	Ser	Serine
+TCC	S	Ser	Serine
+TCA	S	Ser	Serine
+TCG	S	Ser	Serine
+AGT	S	Ser	Serine
+AGC	S	Ser	Serine
+TAT	Y	Tyr	Tyrosine
+TAC	Y	Tyr	Tyrosine
+TGG	W	Trp	Tryptophan
+CAA	Q	Gln	Glutamine
+CAG	Q	Gln	Glutamine
+AAT	N	Asn	Asparagine
+AAC	N	Asn	Asparagine
+CAT	H	His	Histidine
+CAC	H	His	Histidine
+GAA	E	Glu	Glutamic acid
+GAG	E	Glu	Glutamic acid
+GAT	D	Asp	Aspartic acid
+GAC	D	Asp	Aspartic acid
+AAA	K	Lys	Lysine
+AAG	K	Lys	Lysine
+CGT	R	Arg	Arginine
+CGC	R	Arg	Arginine
+CGA	R	Arg	Arginine
+CGG	R	Arg	Arginine
+AGA	R	Arg	Arginine
+AGG	R	Arg	Arginine
+TAA	*	*	Stop codon
+TAG	*	*	Stop codon
+TGA	*	*	Stop codon
-- 
2.39.5