From: Don Armstrong Date: Wed, 21 Mar 2012 22:50:20 +0000 (-0700) Subject: add base versions of routines X-Git-Url: https://git.donarmstrong.com/?p=bio-dbsnp.git;a=commitdiff_plain add base versions of routines --- 8d6648d9f933285fbcf635ae54225aa9dd01ffed diff --git a/bin/snp_info b/bin/snp_info new file mode 100755 index 0000000..545d2f3 --- /dev/null +++ b/bin/snp_info @@ -0,0 +1,706 @@ +#!/usr/bin/perl +# snp_info outputs additional snp information is released under the +# terms of the GPL version 2, or any later version, at your option. +# See the file README and COPYING for more information. +# Copyright 2011 by Don Armstrong . + + +use warnings; +use strict; + +BEGIN {$|=1}; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +snp_info - output snp information + +=head1 SYNOPSIS + + [options] + + Options: + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--nearest-gene> + +Return the name of the nearest gene if the SNP isn't in a gene + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + +echo 'rs1782034' | snp_info; + +=cut + + +use vars qw($DEBUG); +use DBI; +use Data::Dumper; +use Scalar::Util qw(looks_like_number); + + +my %options = (debug => 0, + help => 0, + man => 0, + verbose => 0, + fix_pos => 1, + nearest_gene => 0, + build => '135', + refseq => '37_3', + ); + +GetOptions(\%options, + 'verbose|v+', + 'fix_pos|fix-pos!', + 'nearest_gene|nearest-gene!', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; +# if (1) { +# push @USAGE_ERRORS,"You must pass something"; +# } + +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + + +my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or + die "Unable to connect to database: ".$DBI::errstr; + +my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr; +SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=? +END + +my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr; +SELECT * FROM snphistory WHERE snp_id=?; +END + +my $snp_chr_pos_sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr; +SELECT scpr.snp_id AS snp_id, + scpr.chr AS chr, + scpr.pos AS pos, + scl.phys_pos_from AS pos2, + scpr.orien AS orien, + scl.allele AS ref, + uv.var_str AS var_str, + ruv.var_str AS rev_var_str, + ml.locus_id AS uid, + ml.locus_symbol AS symbol, + gitn.gene_name AS description, + scl.aln_quality AS alignment_quality + FROM snp s + JOIN b$options{build}_snpchrposonref_$options{refseq} scpr ON s.snp_id=scpr.snp_id + JOIN b$options{build}_snpcontigloc_$options{refseq} scl ON scpr.snp_id=scl.snp_id + JOIN b$options{build}_contiginfo_$options{refseq} ci ON scl.ctg_id = ci.ctg_id + LEFT OUTER JOIN b$options{build}_snpcontiglocusid_$options{refseq} ml ON s.snp_id=ml.snp_id + LEFT OUTER JOIN geneidtoname gitn ON ml.locus_id=gitn.gene_id + JOIN univariation uv ON s.univar_id=uv.univar_id + JOIN univariation ruv ON uv.rev_univar_id=ruv.univar_id +WHERE (ci.group_term LIKE 'GRCh%' OR ci.group_term='Primary_Assembly') AND s.snp_id=? ORDER BY scl.aln_quality; +END + +my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr; +SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?; +END + +my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr; +SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; +END + +my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr; +SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; +END + +my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr; +SELECT * FROM mrna_cds_table WHERE cds_gi=?; +END + +my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr; +SELECT ssl.snp_id AS snp_id, + si.subsnp_id AS subsnp_id, + si.chr AS chr, + si.submitted_ind_id AS submitted_ind_id, + ga.chr_num AS chr_num, + ga.rev_flag AS rev_flag, + a.allele AS allele + FROM snpsubsnplink ssl + JOIN subind si on ssl.subsnp_id=si.subsnp_id + JOIN gtyallele ga on si.gty_id=ga.gty_id + JOIN allele a on ga.fwd_allele_id = a.allele_id + WHERE ssl.snp_id=? AND + si.chr=? AND ga.rev_flag = '0' AND + a.allele <> 'N'; +END + +my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr; +SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; +END + +my $sth_near_gene_start = $dbh->prepare(<<'END') // die "Unable to prepare near gene start statement: ".$dbh->errstr; +SELECT cds.start AS start, + cds.stop AS stop, + cds.gene AS symbol, + gitn.gene_name AS description + FROM cds + LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id + WHERE cds.chr = ? AND cds.start > ? + ORDER BY cds.start ASC LIMIT 1; +END + +my $sth_near_gene_stop = $dbh->prepare(<<'END') // die "Unable to prepare near gene stop statement: ".$dbh->errstr; +SELECT cds.start AS start, + cds.stop AS stop, + cds.gene AS symbol, + gitn.gene_name AS description + FROM cds + LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id + WHERE cds.chr = ? AND cds.stop < ? + ORDER BY cds.stop DESC LIMIT 1; +END + +my $prot_table; + +while () { + chomp; + my ($codon,$aa,$aa_med,$aa_long) = split /\t/; + $prot_table->{$codon} = $aa; +} + +my @headers = qw(id chr pos ref alt); +my @row_keys_to_initialize = qw(gene nonsyn aa_ref aa_alt codon_ref codon_alt cdspos protpos loc num var acc phase reffreq altfreq refaltn); +print join("\t",@headers,'orig_id',@row_keys_to_initialize)."\n"; +while (<>) { + print $_ and next if /^#/; + chomp; + my @row = split /\t/, $_; + my %row; + @row{@headers} = @row[0..$#headers]; + $row{orig_id} = $row{id}; + # this is taken directly from vcf_fill_rsid; we want it for the time being anyway + if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) { + my $new_id = find_rsid(chr => fixup_chr($row{chr}), + # snps are 0 indexed + pos => $row{pos} - 1, + dbh => $dbh, + sth => $sth, + ); + if ($new_id !~ /^(?:[\.\?])$/) { + $row{id} = "rs".$new_id; + } + } + if ($row{id} =~ /(rs|ss)(\d+)/) { + my $type = $1; + my $id = $2; + $id = find_real_snp_id(id => $id, + dbh => $dbh, + sth => $real_snp_sth, + ); + if (not defined $id and + is_retired_snp(id => $id, + dbh => $dbh, + sth => $sth_snp_history, + )) { + if (defined $row{chr} and defined $row{pos}) { + $row{id} = "$row{chr}-$row{pos}"; + } + } + else { + $row{id} = "$type$id"; + } + } + + if (defined $row{id} and ($options{fix_pos} or + not defined $row{pos} or + not defined $row{chr} or + not defined $row{ref} or + not defined $row{alt})) { + my %info = find_chr_pos(id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0], + dbh => $dbh, + sth => $snp_chr_pos_sth, + ); + for my $inf (qw(pos chr ref alt)) { + $row{$inf} = $info{$inf} if $options{fix_pos} and defined $info{$inf} or not defined $row{$inf}; + } + # handle partially aligned snps + if ($options{fix_pos} and not defined $row{pos}) { + $row{pos} = $info{pos2}; + } + if (not defined $row{pos} or not defined $row{chr}) { + if (defined $row{pos}) { + print STDERR "$row{id} is ambiguously placed\n"; + } + else { + print STDERR "Unable to find a position/chr for $row{id}\n"; + } + } + } + my $mrna_information = mrna_information(dbh => $dbh, + sth => $sth_mrna_ss, + # these are all 1 indexed + pos => $row{pos}, + chr => fixup_chr($row{chr}), + ref => $row{ref}, + alt => $row{alt}, + cds_sth => $sth_cds_ss, + mrna_cds_sth => $sth_mrna_cds, + ); + @row{@row_keys_to_initialize} = ('') x @row_keys_to_initialize; + if (keys %{$mrna_information}) { + # strip out existing information in $row{info} + my @extra_info; + $row{gene} = $mrna_information->{gene}; + $row{nonsyn} = 1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn}; + $row{aa_ref} = $mrna_information->{orig_aa} if defined $mrna_information->{orig_aa}; + $row{aa_alt} = $mrna_information->{new_aa} if defined $mrna_information->{new_aa}; + $row{codon_ref} = $mrna_information->{orig_codon} if defined $mrna_information->{orig_codon}; + $row{codon_alt} = $mrna_information->{new_codon} if defined $mrna_information->{new_codon}; + $row{cdspos} = $mrna_information->{cds_pos} if defined $mrna_information->{cds_pos}; + $row{protpos} = $mrna_information->{prot_pos} if defined $mrna_information->{prot_pos}; + $row{loc} = $mrna_information->{exon}?'exon':'intron' if defined $mrna_information->{gene}; + $row{num} = $mrna_information->{number}; + $row{var} = $mrna_information->{variant} if defined $mrna_information->{variant}; + $row{acc} = $mrna_information->{accession}; + $row{phase} = $mrna_information->{phase} if defined $mrna_information->{phase}; + } + if ((not defined $row{gene} or not length $row{gene}) + and $options{nearest_gene}) { + $row{gene} = find_nearest_gene(dbh => $dbh, + start_sth => $sth_near_gene_start, + stop_sth => $sth_near_gene_stop, + chr => fixup_chr($row{chr}), + pos => $row{pos}, + ); + } + my $allele_freq_information = allele_freq_information(dbh => $dbh, + sth => $sth_allele_freq, + id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0], + # these are all 1 indexed + pos => $row{pos}, + chr => fixup_chr($row{chr}), + ref => $row{ref}, + alt => $row{alt}, + ) if defined $row{id} and $row{id} =~ /^(?:rs|ss)/; + if (keys %{$allele_freq_information}) { + # strip out existing information in $row{info} + if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) { + $row{reffreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref})); + $row{altfreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt})); + $row{refaltn} = $allele_freq_information->{alleles}{total}; + } + } + print join("\t",map {defined $_?$_:'NA'} @row{@headers,'orig_id',@row_keys_to_initialize},$#row > $#headers?@row[@headers..$#row]:())."\n"; + if ($options{verbose} > 0 and $. % 100 == 0) { + print STDERR "At line $.\n"; + } +} + +sub find_real_snp_id { + my %param = @_; + + my %info; + my $rv = $param{sth}->execute($param{id},$param{id}) // + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]); + if ($param{sth}->err) { + print STDERR $param{sth}->errstr; + $param{sth}->finish; + return $param{id}; + } + $param{sth}->finish; + return $real_id // $param{id}; +} + +sub find_chr_pos { + my %param = @_; + + my %info; + my $rv = $param{sth}->execute($param{id}) // + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my $results = $param{sth}->fetchall_arrayref({}); + if ($param{sth}->err) { + print STDERR $param{sth}->errstr; + $param{sth}->finish; + return %info; + } + for my $result (@{$results}) { + %info = %{$result}; + } + if (keys %info) { + if ($info{chr} eq 'NotOn') { + undef $info{chr}; + } + if ($info{orien}) { + $info{orien_var_str} = $info{rev_var_str}; + } + else { + $info{orien_var_str} = $info{var_str}; + } + my @alleles = split qr{/},$info{orien_var_str}; + ($info{alt}) = grep {$_ ne $info{ref}} @alleles; + # fix $row{pos} + $info{pos}+=1; + } + $param{sth}->finish; + return %info; +} + +sub find_nearest_gene { + my %param = @_; + return undef unless defined $param{chr} and defined $param{pos}; + + my %possible_genes; + + for my $st (qw(start stop)) { + my $rv = $param{"${st}_sth"}->execute($param{chr},$param{pos}) // + die "Unable to execute nearest gene $st statement properly: ".$param{dbh}->errstr; + my $results = $param{"${st}_sth"}->fetchall_arrayref({}); + if ($param{"${st}_sth"}->err) { + print STDERR $param{"${st}_sth"}->errstr; + $param{"${st}_sth"}->finish; + return ''; + } + $possible_genes{$st} = $results->[0] + } + for my $st (qw(start stop)) { + # opposite of start/stop (stop/start) + my $st_op = $st eq q(start) ? q(stop) : q(start); + if (not defined $possible_genes{$st}{start}) { + if (defined $possible_genes{$st_op}{start}) { + return $possible_genes{$st_op}{symbol} + } + else { + return undef; + } + } + } + my $gene; + # if the start of the start gene is closer to the position of the + # snp than the stop of the stop gene, it's the nearest gene; + # otherwise, the stop gene is + if ($possible_genes{start}{start} - $param{pos} < + $param{pos} - $possible_genes{stop}{stop} + ) { + return $possible_genes{start}{symbol}; + } + else { + return $possible_genes{stop}{symbol}; + } +} + + +sub mrna_information{ + my %param = @_; + + my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr; + + my $results = $param{sth}->fetchall_arrayref({}); + if ($param{sth}->err) { + print STDERR $param{sth}->errstr; + return {}; + } + my $chosen_result; + for my $result (@{$results}) { + if (not defined $chosen_result) { + $chosen_result = $result; + next; + } + if (not defined $chosen_result->{variant} or + (defined $result->{variant} and + $chosen_result->{variant} > $result->{variant})) { + $chosen_result = $result; + next; + } + } + if (defined $chosen_result and $chosen_result->{exon}) { + # try to figure out the cds if there is mrna information and + # we're in an exon + my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my $cds_results = $param{cds_sth}->fetchall_arrayref({}); + if ($param{cds_sth}->err) { + print STDERR $param{cds_sth}->errstr; + } + else { + print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG; + my $chosen_cds; + for my $cds_result (@{$cds_results}) { + if (not defined $chosen_cds) { + $chosen_cds = $cds_result; + next; + } + if (not defined $cds_result->{variant} or + (defined $chosen_cds->{variant} and + $chosen_cds->{variant} ge $cds_result->{variant})) { + $chosen_cds = $cds_result; + next; + } + } + if (defined $chosen_cds and keys %{$chosen_cds}) { + my $comp_offset = $chosen_cds->{complement}?-1:1; + $chosen_result->{in_cds} = 1; + $chosen_result->{cds_pos} = + ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) + + $chosen_cds->{position}; + $chosen_result->{gene} = $chosen_cds->{gene}; + $chosen_result->{pos} = $chosen_cds->{pos}; + $chosen_result->{number} = $chosen_cds->{number}; + $chosen_result->{variant} = $chosen_cds->{variant}; + $chosen_result->{acc} = $chosen_cds->{accession}; + $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3); + $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3; + print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG; + my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref(); + if (defined $mrna_cds_result and keys %{$mrna_cds_result}) { + print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param], + [qw(mrna_cds_result chosen_result *param)]) if $DEBUG; + my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3) ,3; + my $orig_aa = convert_to_prot_seq($codon); + my $modified_codon = $codon; + substr($modified_codon,$chosen_result->{cds_pos} % 3,1)= + ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt}); + my $new_aa = convert_to_prot_seq($modified_codon); + print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3), + codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG; + if ($DEBUG > 1) { + my $a = -1; + print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g; + } + $chosen_result->{orig_aa} = $orig_aa; + $chosen_result->{new_aa} = $new_aa; + $chosen_result->{new_codon} = $modified_codon; + $chosen_result->{orig_codon} = $codon; + $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0; + } + } + } + } + return $chosen_result // {}; +} + + +sub fixup_chr { + my ($chr) = @_; + return $chr if not defined $chr; + + $chr =~ s/^NC_00000?//; + $chr =~ s/^chr//; + if (looks_like_number($chr)) { + if ($chr == 23) { + $chr = "X" + } + elsif ($chr == 24) { + $chr = "Y" + } + } + return ($chr); +} + +sub allele_freq_information{ + my %param = @_; + + return {} if $param{id} !~ /^\d+$/; + + my $rv = $param{sth}->execute($param{id},$param{chr}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr}; + + my $results = $param{sth}->fetchall_arrayref({}); + if ($param{sth}->err) { + print STDERR $param{sth}->errstr; + return {}; + } + my %pop_alleles; + my %alleles; + my %seen_indiv; + my %indivs; + for my $result (@{$results}) { + next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}}; + $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele}; + if (defined $result->{loc_pop_id}) { + $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id}; + $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle}; + $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++; + $pop_alleles{$result->{loc_pop_id}}{total}++; + } + $alleles{$result->{allele}}++; + $alleles{total}++; + $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1; + } + my %genotypes; + my %pop_genotypes; + for my $indiv (values %indivs) { + if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) { + my $genotype = join('/', + sort($indiv->{chr_1}, + $indiv->{chr_2})); + if (defined $indiv->{loc_pop_id}) { + $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++; + $pop_genotypes{$indiv->{loc_pop_id}}{total}++; + } + $genotypes{$genotype}++; + $genotypes{total}++; + } + } + return {pop_alleles => \%pop_alleles, + pop_genotypes => \%pop_genotypes, + genotypes => \%genotypes, + alleles => \%alleles, + }; +} + +sub allele_freq{ + my ($freq_table,$allele) = @_; + my $flipped_allele = $allele; + $flipped_allele =~ tr/ACTG/TGAC/; + if (defined $allele) { + if (exists $freq_table->{$allele} and defined $freq_table->{$allele}) { + return $freq_table->{$allele}/$freq_table->{total}; + } + elsif (exists $freq_table->{$flipped_allele} and defined $freq_table->{$flipped_allele}) { + return $freq_table->{$flipped_allele}/$freq_table->{total}; + } + return 0; + } + my @alleles = grep {$_ ne 'total'} keys %{$freq_table}; + return join(',',map {$_.':'.(defined $freq_table->{$_} ? + $freq_table->{$_}/$freq_table->{total} : 0)} @alleles); +} + + +sub find_rsid { + my %param = @_; + + my $rv = $param{sth}->execute($param{chr},$param{pos}) // + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my @snp_ids = grep {$_ =~ /^\d+$/} + map {ref $_ eq 'ARRAY'? @{$_}:$_} + map {ref $_ eq 'ARRAY'? @{$_}:$_} + $sth->fetchall_arrayref([0]); + if ($sth->err) { + print STDERR $sth->errstr; + return '?'; + } + if (not @snp_ids) { + return '.'; + } + return join(',',@snp_ids); +} + +sub dna_complement{ + my ($seq) = @_; + return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq); +} + +sub convert_to_prot_seq{ + my ($mrna) = @_; + $mrna =~ s/U/T/g; + my $prot = ''; + for my $pos (0..(length($mrna) / 3 - 1)) { + my $codon = substr($mrna,$pos*3,3); + if (not exists $prot_table->{$codon}) { + # this isn't quite right, as we should really be handling + # ambiguities which do not result in different amino + # acids, but for the time being, do it this way. + $prot .= 'N'; + } + else { + $prot .= $prot_table->{$codon}; + } + } + return $prot; +} + + +__DATA__ +ATT I Ile Isoleucine +ATC I Ile Isoleucine +ATA I Ile Isoleucine +CTT L Leu Leucine +CTC L Leu Leucine +CTA L Leu Leucine +CTG L Leu Leucine +TTA L Leu Leucine +TTG L Leu Leucine +GTT V Val Valine +GTC V Val Valine +GTA V Val Valine +GTG V Val Valine +TTT F Phe Phenylalanine +TTC F Phe Phenylalanine +ATG M Met Methionine +TGT C Cys Cysteine +TGC C Cys Cysteine +GCT A Ala Alanine +GCC A Ala Alanine +GCA A Ala Alanine +GCG A Ala Alanine +GGT G Gly Glycine +GGC G Gly Glycine +GGA G Gly Glycine +GGG G Gly Glycine +CCT P Pro Proline +CCC P Pro Proline +CCA P Pro Proline +CCG P Pro Proline +ACT T Thr Threonine +ACC T Thr Threonine +ACA T Thr Threonine +ACG T Thr Threonine +TCT S Ser Serine +TCC S Ser Serine +TCA S Ser Serine +TCG S Ser Serine +AGT S Ser Serine +AGC S Ser Serine +TAT Y Tyr Tyrosine +TAC Y Tyr Tyrosine +TGG W Trp Tryptophan +CAA Q Gln Glutamine +CAG Q Gln Glutamine +AAT N Asn Asparagine +AAC N Asn Asparagine +CAT H His Histidine +CAC H His Histidine +GAA E Glu Glutamic acid +GAG E Glu Glutamic acid +GAT D Asp Aspartic acid +GAC D Asp Aspartic acid +AAA K Lys Lysine +AAG K Lys Lysine +CGT R Arg Arginine +CGC R Arg Arginine +CGA R Arg Arginine +CGG R Arg Arginine +AGA R Arg Arginine +AGG R Arg Arginine +TAA * * Stop codon +TAG * * Stop codon +TGA * * Stop codon diff --git a/bin/snp_study_table b/bin/snp_study_table new file mode 100755 index 0000000..6a8d887 --- /dev/null +++ b/bin/snp_study_table @@ -0,0 +1,200 @@ +#! /usr/bin/perl +# snp_study_table outputs a table of study results, and is released +# under the terms of the GPL version 2, or any later version, at your +# option. See the file README and COPYING for more information. +# Copyright 2011 by Don Armstrong . +# $Id: perl_script 1825 2011-01-02 01:53:43Z don $ + + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +snp_study_table - output an xls table of study results + +=head1 SYNOPSIS + +snp_study_table [options] [snps] + + Options: + --output output filename (Default STDOUT) + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 SNPS + +SNPS can be specified as rs12345 or as chr-pos; in the later case, the +ga_chr_pos table will be used, in the former, the ga_snp table will be +used. + +=head1 OPTIONS + +=over + +=item B<--output> + +Output filename (defaults to STDOUT) + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + + +=cut + + +use vars qw($DEBUG); + +use DBI; +use Spreadsheet::WriteExcel; + +my %options = (debug => 0, + help => 0, + man => 0, + ); + +GetOptions(\%options, + 'output|o=s', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; + +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + +my $out_fh = \*STDOUT; +if (exists $options{output}) { + $out_fh = IO::File->new($options{output},'w') or + die "Unable to open $options{output} for writing: $!"; +} +my $wb = Spreadsheet::WriteExcel->new($out_fh); +my $ws = $wb->add_worksheet('snp_results'); + +my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or + die "Unable to connect to database: ".$DBI::errstr; + +my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr; +SELECT * FROM ga_snp WHERE snp_id=?; +END + +my $sth_ga_cp_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr; +SELECT * FROM ga_chr_pos WHERE chr=? AND pos=?; +END + + +if (not @ARGV) { + while () { + next if /^#/; + chomp; + push @ARGV,$_; + } +} + +my %studies; +my %snps; +for my $snp (@ARGV) { + my $results; + if ($snp =~ /^(?:rs)(\d+)$/) { + $results = get_snp_results(dbh => $dbh, + sth => $sth_ga_info, + bind => [$1], + ); + } + elsif ($snp =~ /^(\d+)-(\d+)$/) { + $results = get_snp_results(dbh => $dbh, + sth => $sth_ga_cp_info, + bind => [$1,$2], + ); + } + else { + print STDERR "Invalid snp format '$snp'\n"; + next; + } + $snps{$snp} = $results; + for my $result (@{$results}) { + $studies{$result->{study_name}}{$result->{subpart_name}} = 1; + } +} +$dbh->disconnect(); + +#column names +my @cn = ('A'..'Z','AA'..'ZZ'); + +my $r = 1; +my $c = 0; +my $f_center = $wb->add_format(align => 'center'); +$ws->write($cn[$c++].($r+2),'SNP'); +for my $study (sort keys %studies) { + my $n_substudy = keys %{$studies{$study}}; + $ws->write($cn[$c].$r,$study); + $ws->merge_range($r-1,$c,$r-1,$c+$n_substudy*3-1,$study,$f_center); + for my $substudy (sort keys %{$studies{$study}}) { + $ws->write($cn[$c].($r+1),$substudy); + $ws->merge_range($r,$c,$r,$c+2,$substudy,$f_center); + $ws->write($cn[$c].($r+2),'P value'); + $ws->write($cn[$c+1].($r+2),'Q value'); + $ws->write($cn[$c+2].($r+2),'FDR'); + $c+=3; + } +} +$r+=3; +$c=0; +for my $snp (sort keys %snps) { + $ws->write($cn[$c++].$r,$snp); + my %snp_studies; + for my $row (@{$snps{$snp}}) { + $snp_studies{$row->{study_name}}{$row->{subpart_name}} = [@{$row}{qw(pvalue qvalue fdr)}]; + } + for my $study (sort keys %studies) { + for my $substudy (sort keys %{$studies{$study}}) { + if (not exists $snp_studies{$study} or + not exists $snp_studies{$study}{$substudy} + ) { + $c+=3; + } + else { + for (0..2) { + $ws->write($cn[$c++].$r,defined $snp_studies{$study}{$substudy}[$_]?$snp_studies{$study}{$substudy}[$_]:''); + } + } + } + } + $r++; + $c=0; +} + +sub get_snp_results{ + my %param = @_; + my $rv = $param{sth}->execute(@{$param{bind}}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my $results = $param{sth}->fetchall_arrayref({}); + if ($param{sth}->err) { + print STDERR $param{sth}->errstr; + return {}; + } + return $results; +} + + +__END__ diff --git a/bin/vcf_add_snp_info b/bin/vcf_add_snp_info new file mode 100755 index 0000000..db3dcb6 --- /dev/null +++ b/bin/vcf_add_snp_info @@ -0,0 +1,611 @@ +#! /usr/bin/perl +# vcf_add_snp_info adds additional snp information to vcf files, and +# is released under the terms of the GPL version 2, or any later +# version, at your option. See the file README and COPYING for more +# information. +# Copyright 2011 by Don Armstrong . + + +use warnings; +use strict; + +BEGIN {$|=1}; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +vcf_add_snp_info - Add additional snp information to vcf files + +=head1 SYNOPSIS + + [options] + + Options: + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + +vcf_fill_rsid foo.vcf > bar.vcf + +=cut + + +use vars qw($DEBUG); +use DBI; +use Data::Dumper; +use Scalar::Util qw(looks_like_number); + + +my %options = (debug => 0, + help => 0, + man => 0, + verbose => 0, + build => '135', + refseq => '37_3', + ); + +GetOptions(\%options, + 'build', + 'refseq', + 'verbose|v+', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; +# if (1) { +# push @USAGE_ERRORS,"You must pass something"; +# } + +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + + +my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or + die "Unable to connect to database: ".$DBI::errstr; + +my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr; +SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?; +END + +my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr; +SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=? +END + +my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr; +SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; +END + +my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr; +SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; +END + +my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr; +SELECT * FROM mrna_cds_table WHERE cds_gi=?; +END + +my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr; +SELECT * FROM snphistory WHERE snp_id=?; +END + + +my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr; +SELECT ssl.snp_id AS snp_id, + si.subsnp_id AS subsnp_id, +-- ssl.substrand_reversed_flag, + si.chr AS chr, + si.submitted_ind_id AS submitted_ind_id, + ga.chr_num AS chr_num, +-- si.submitted_strand_code, +-- si.allele_flag, + ga.rev_flag AS rev_flag, + a.allele AS allele, + ra.allele AS rev_allele, + p.handle AS pop_handle, + p.loc_pop_id AS loc_pop_id + FROM snpsubsnplink ssl + JOIN subind si on ssl.subsnp_id=si.subsnp_id + JOIN gtyallele ga on si.gty_id=ga.gty_id + JOIN allele a on ga.fwd_allele_id = a.allele_id + JOIN allele ra on a.rev_allele_id = ra.allele_id + JOIN submittedindividual sind on si.submitted_ind_id=sind.submitted_ind_id + JOIN population p on sind.pop_id=p.pop_id + WHERE ssl.snp_id=? AND + si.chr=? AND ga.rev_flag = '0' AND + a.allele <> 'N'; +END + +my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr; +SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant; +END + +my $sth_ga_info = $dbh->prepare(<<'END') // die "Unable to prepare ga_snp info statement: ".$dbh->errstr; +SELECT * FROM ga_snp WHERE snp_id=?; +END + +my $prot_table; + +while () { + chomp; + my ($codon,$aa,$aa_med,$aa_long) = split /\t/; + $prot_table->{$codon} = $aa; +} + +my @headers = qw(chr pos id ref alt qual filter info format); +while (<>) { + print $_ and next if /^#/; + chomp; + my @row = split /\t/, $_; + my %row; + @row{@headers} = @row[0..$#headers]; + # this is taken directly from vcf_fill_rsid; we want it for the time being anyway + if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) { + $row{id} = find_rsid(chr => fixup_chr($row{chr}), + # snps are 0 indexed + pos => $row{pos} - 1, + dbh => $dbh, + sth => $sth, + ); + } + if ($row{id} =~ /(rs|ss)(\d+)/) { + my $type = $1; + my $id = $2; + $id = find_real_snp_id(id => $id, + dbh => $dbh, + sth => $real_snp_sth, + ); + if (not defined $id and + is_retired_snp(id => $id, + dbh => $dbh, + sth => $sth_snp_history, + )) { + if (defined $row{chr} and defined $row{pos}) { + $row{id} = undef; + } + } + else { + $row{id} = "$type$id"; + } + } + my $mrna_information = mrna_information(dbh => $dbh, + sth => $sth_mrna_ss, + # these are all 1 indexed + pos => $row{pos}, + chr => fixup_chr($row{chr}), + ref => $row{ref}, + alt => $row{alt}, + cds_sth => $sth_cds_ss, + mrna_cds_sth => $sth_mrna_cds, + ); + if (keys %{$mrna_information}) { + # strip out existing information in $row{info} + $row{info} =~ s{(?:GENE|(?:CDS|PROT|)POS|NUM|VAR|ACC|NONSYN|(?:ALT|REF)(?:AA|CODON))\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g; + my @extra_info; + push @extra_info,"GENE=".$mrna_information->{gene}; + push @extra_info,"NONSYN=".1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn}; + push @extra_info,"REFAA=".$mrna_information->{orig_aa} if defined $mrna_information->{orig_aa}; + push @extra_info,"ALTAA=".$mrna_information->{new_aa} if defined $mrna_information->{new_aa}; + push @extra_info,"REFCODON=".$mrna_information->{orig_codon} if defined $mrna_information->{orig_codon}; + push @extra_info,"ALTCODON=".$mrna_information->{new_codon} if defined $mrna_information->{new_codon}; + push @extra_info,"CDSPOS=".$mrna_information->{cds_pos} if defined $mrna_information->{cds_pos}; + push @extra_info,"PROTPOS=".$mrna_information->{prot_pos} if defined $mrna_information->{prot_pos}; + push @extra_info,"POS=".($mrna_information->{exon}?'exon':'intron'); + push @extra_info,"NUM=".$mrna_information->{number}; + push @extra_info,"VAR=".$mrna_information->{variant} if defined $mrna_information->{variant}; + push @extra_info,"ACC=".$mrna_information->{accession}; + push @extra_info,"ACCVER=".$mrna_information->{accession_ver}; + push @extra_info,"PHASE=".$mrna_information->{phase} if defined $mrna_information->{phase}; + $row{info} = join(';',@extra_info,$row{info}); + } + my $allele_freq_information = allele_freq_information(dbh => $dbh, + sth => $sth_allele_freq, + id => $row{id}, + # these are all 1 indexed + pos => $row{pos}, + chr => fixup_chr($row{chr}), + ref => $row{ref}, + alt => $row{alt}, + ); + if (keys %{$allele_freq_information}) { + # strip out existing information in $row{info} + $row{info} =~ s{(?:REFFREQ|ALTFREQ|REFALTN)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g; + my @extra_info; + if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) { + push @extra_info,"REFFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref})); + push @extra_info,"ALTFREQ=".sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt})); + push @extra_info,"REFALTN=".$allele_freq_information->{alleles}{total}; + } + $row{info} = join(';',@extra_info,$row{info}); + } + my $ga_study_information = ga_study_information(dbh => $dbh, + sth => $sth_ga_info, + id => $row{id}, + ); + if (defined $ga_study_information) { + # strip out existing information in $row{info} + $row{info} =~ s{(?:GASTUDY|GASTUDYSIG)\=(?:[^\;]+|"[^"]+")(?:;|$)}{}g; + my @extra_info; + push @extra_info,"GASTUDY=".$ga_study_information->{studies}; + push @extra_info,"GASTUDYSIG=".($ga_study_information->{significant}?'TRUE':'FALSE'); + $row{info} = join(';',@extra_info,$row{info}); + } + print join("\t",@row{@headers},$#row > $#headers?@row[@headers..$#row]:())."\n"; + if ($options{verbose} > 0 and $. % 100 == 0) { + print STDERR "At line $.\n"; + } +} + +sub ga_study_information { + my %param = @_; + + return undef unless $param{id} =~ /^\d+$/; + my $rv = $param{sth}->execute($param{id}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my $results = $param{sth}->fetchall_arrayref({}); + if (not @{$results}) { + return undef; + } + my $any_significant=0; + my @values; + for my $result (sort { + if (looks_like_number($a->{pvalue})) { + if (looks_like_number($b->{pvalue})) { + $a->{pvalue} <=> $b->{pvalue}; + } else { + -1; + } + } elsif (looks_like_number($b->{pvalue})) { + 1; + } else { + 0; + }} @{$results}) { + if ($result->{significant}) { + $any_significant=1; + } + my $value = $result->{study_name}.':'.$result->{subpart_name}; + for my $t (qw(pvalue qvalue fdr)) { + next unless exists $result->{$t} and defined $result->{$t} and length $result->{$t}; + my ($fl) = $t =~ m/^(.)/; + $value .= ":$fl:".sprintf('%.3g',$result->{$t}); + } + push @values, $value; + } + return {significant => $any_significant, + studies => join(',',@values), + }; + +} + +sub mrna_information{ + my %param = @_; + + my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr; + + my $results = $param{sth}->fetchall_arrayref({}); + if ($param{sth}->err) { + print STDERR $param{sth}->errstr; + return {}; + } + my $chosen_result; + for my $result (@{$results}) { + if (not defined $chosen_result) { + $chosen_result = $result; + next; + } + if (not defined $chosen_result->{variant} or + (defined $result->{variant} and + $chosen_result->{variant} > $result->{variant})) { + $chosen_result = $result; + next; + } + } + if (defined $chosen_result and $chosen_result->{exon}) { + # try to figure out the cds if there is mrna information and + # we're in an exon + my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my $cds_results = $param{cds_sth}->fetchall_arrayref({}); + if ($param{cds_sth}->err) { + print STDERR $param{cds_sth}->errstr; + } + else { + print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG; + my $chosen_cds; + for my $cds_result (@{$cds_results}) { + if (not defined $chosen_cds) { + $chosen_cds = $cds_result; + next; + } + if (not defined $cds_result->{variant} or + (defined $chosen_cds->{variant} and + $chosen_cds->{variant} ge $cds_result->{variant})) { + $chosen_cds = $cds_result; + next; + } + } + if (defined $chosen_cds and keys %{$chosen_cds}) { + my $comp_offset = $chosen_cds->{complement}?-1:1; + $chosen_result->{in_cds} = 1; + $chosen_result->{cds_pos} = + ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) + + $chosen_cds->{position}; + $chosen_result->{gene} = $chosen_cds->{gene}; + $chosen_result->{pos} = $chosen_cds->{pos}; + $chosen_result->{number} = $chosen_cds->{number}; + $chosen_result->{variant} = $chosen_cds->{variant}; + $chosen_result->{acc} = $chosen_cds->{accession}; + $chosen_result->{accver} = $chosen_cds->{accession_ver}; + $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3); + $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3; + print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG; + my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref(); + if (defined $mrna_cds_result and keys %{$mrna_cds_result}) { + print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param], + [qw(mrna_cds_result chosen_result *param)]) if $DEBUG; + my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3) ,3; + my $orig_aa = convert_to_prot_seq($codon); + my $modified_codon = $codon; + substr($modified_codon,$chosen_result->{cds_pos} % 3,1)= + ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt}); + my $new_aa = convert_to_prot_seq($modified_codon); + print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3), + codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG; + if ($DEBUG > 1) { + my $a = -1; + print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g; + } + $chosen_result->{orig_aa} = $orig_aa; + $chosen_result->{new_aa} = $new_aa; + $chosen_result->{new_codon} = $modified_codon; + $chosen_result->{orig_codon} = $codon; + $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0; + } + } + } + } + return $chosen_result // {}; +} + +sub find_real_snp_id { + my %param = @_; + + my %info; + my $rv = $param{sth}->execute($param{id},$param{id}) // + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]); + if ($param{sth}->err) { + print STDERR $param{sth}->errstr; + $param{sth}->finish; + return $param{id}; + } + $param{sth}->finish; + return $real_id // $param{id}; +} + +sub is_retired_snp { + my %param = @_; + + my %info; + my $rv = $param{sth}->execute($param{id}) // + die "Unable to execute statement properly: ".$param{dbh}->errstr; + if ($param{sth}->rows >= 1) { + return 1; + } + return 0; +} + + +sub fixup_chr { + my ($chr) = @_; + + $chr =~ s/^NC_00000?//; + $chr =~ s/^chr//; + if (looks_like_number($chr)) { + if ($chr == 23) { + $chr = "X" + } + elsif ($chr == 24) { + $chr = "Y" + } + } + return ($chr); +} + +sub allele_freq_information{ + my %param = @_; + + return {} if $param{id} !~ /^\d+$/; + + my $rv = $param{sth}->execute($param{id},$param{chr}) or + die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr}; + + my $results = $param{sth}->fetchall_arrayref({}); + if ($param{sth}->err) { + print STDERR $param{sth}->errstr; + return {}; + } + my %pop_alleles; + my %alleles; + my %seen_indiv; + my %indivs; + for my $result (@{$results}) { + next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}}; + $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle}; + $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id}; + $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele}; + if (defined $result->{loc_pop_id}) { + $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++; + $pop_alleles{$result->{loc_pop_id}}{total}++; + } + $alleles{$result->{allele}}++; + $alleles{total}++; + $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1; + } + my %genotypes; + my %pop_genotypes; + for my $indiv (values %indivs) { + if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) { + my $genotype = join('/', + sort($indiv->{chr_1}, + $indiv->{chr_2})); + $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++; + $pop_genotypes{$indiv->{loc_pop_id}}{total}++; + $genotypes{$genotype}++; + $genotypes{total}++; + } + } + return {pop_alleles => \%pop_alleles, + pop_genotypes => \%pop_genotypes, + genotypes => \%genotypes, + alleles => \%alleles, + }; +} + +sub allele_freq{ + my ($freq_table,$allele) = @_; + if (defined $allele) { + return ((exists $freq_table->{$allele} and defined $freq_table->{$allele}) ? + $freq_table->{$allele}/$freq_table->{total} : 0); + } + my @alleles = grep {$_ ne 'total'} keys %{$freq_table}; + return join(',',map {$_.':'.(defined $freq_table->{$_} ? + $freq_table->{$_}/$freq_table->{total} : 0)} @alleles); +} + + +sub find_rsid { + my %param = @_; + + my $rv = $param{sth}->execute($param{chr},$param{pos}) // + die "Unable to execute statement properly: ".$param{dbh}->errstr; + my @snp_ids = grep {$_ =~ /^\d+$/} + map {ref $_ eq 'ARRAY'? @{$_}:$_} + map {ref $_ eq 'ARRAY'? @{$_}:$_} + $sth->fetchall_arrayref([0]); + if ($sth->err) { + print STDERR $sth->errstr; + return '?'; + } + if (not @snp_ids) { + return '.'; + } + return join(',',@snp_ids); +} + +sub dna_complement{ + my ($seq) = @_; + return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq); +} + +sub convert_to_prot_seq{ + my ($mrna) = @_; + $mrna =~ s/U/T/g; + my $prot = ''; + for my $pos (0..(length($mrna) / 3 - 1)) { + my $codon = substr($mrna,$pos*3,3); + if (not exists $prot_table->{$codon}) { + # this isn't quite right, as we should really be handling + # ambiguities which do not result in different amino + # acids, but for the time being, do it this way. + $prot .= 'N'; + } + else { + $prot .= $prot_table->{$codon}; + } + } + return $prot; +} + + +__DATA__ +ATT I Ile Isoleucine +ATC I Ile Isoleucine +ATA I Ile Isoleucine +CTT L Leu Leucine +CTC L Leu Leucine +CTA L Leu Leucine +CTG L Leu Leucine +TTA L Leu Leucine +TTG L Leu Leucine +GTT V Val Valine +GTC V Val Valine +GTA V Val Valine +GTG V Val Valine +TTT F Phe Phenylalanine +TTC F Phe Phenylalanine +ATG M Met Methionine +TGT C Cys Cysteine +TGC C Cys Cysteine +GCT A Ala Alanine +GCC A Ala Alanine +GCA A Ala Alanine +GCG A Ala Alanine +GGT G Gly Glycine +GGC G Gly Glycine +GGA G Gly Glycine +GGG G Gly Glycine +CCT P Pro Proline +CCC P Pro Proline +CCA P Pro Proline +CCG P Pro Proline +ACT T Thr Threonine +ACC T Thr Threonine +ACA T Thr Threonine +ACG T Thr Threonine +TCT S Ser Serine +TCC S Ser Serine +TCA S Ser Serine +TCG S Ser Serine +AGT S Ser Serine +AGC S Ser Serine +TAT Y Tyr Tyrosine +TAC Y Tyr Tyrosine +TGG W Trp Tryptophan +CAA Q Gln Glutamine +CAG Q Gln Glutamine +AAT N Asn Asparagine +AAC N Asn Asparagine +CAT H His Histidine +CAC H His Histidine +GAA E Glu Glutamic acid +GAG E Glu Glutamic acid +GAT D Asp Aspartic acid +GAC D Asp Aspartic acid +AAA K Lys Lysine +AAG K Lys Lysine +CGT R Arg Arginine +CGC R Arg Arginine +CGA R Arg Arginine +CGG R Arg Arginine +AGA R Arg Arginine +AGG R Arg Arginine +TAA * * Stop codon +TAG * * Stop codon +TGA * * Stop codon