2 # snp_info outputs additional snp information is released under the
3 # terms of the GPL version 2, or any later version, at your option.
4 # See the file README and COPYING for more information.
5 # Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
18 snp_info - output snp information
25 --debug, -d debugging level (Default 0)
26 --help, -h display this help
27 --man, -m display manual
33 =item B<--nearest-gene>
35 Return the name of the nearest gene if the SNP isn't in a gene
39 Debug verbosity. (Default 0)
43 Display brief usage information.
53 echo 'rs1782034' | snp_info;
61 use Scalar::Util qw(looks_like_number);
64 my %options = (debug => 0,
77 'nearest_gene|nearest-gene!',
78 'debug|d+','help|h|?','man|m');
80 pod2usage() if $options{help};
81 pod2usage({verbose=>2}) if $options{man};
83 $DEBUG = $options{debug};
87 # push @USAGE_ERRORS,"You must pass something";
90 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
93 my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 0}) or
94 die "Unable to connect to database: ".$DBI::errstr;
96 my $real_snp_sth = $dbh->prepare(<<'END') // die "Unable to prepare realsnp statement: ".$dbh->errstr;
97 SELECT DISTINCT rscurrent FROM rsmergearch WHERE rslow=? OR rshigh=?
100 my $sth_snp_history = $dbh->prepare(<<'END') // die "Unable to prepare snp_history statement: ".$dbh->errstr;
101 SELECT * FROM snphistory WHERE snp_id=?;
104 my $snp_chr_pos_sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
105 SELECT scpr.snp_id AS snp_id,
108 scl.phys_pos_from AS pos2,
111 uv.var_str AS var_str,
112 ruv.var_str AS rev_var_str,
114 ml.locus_symbol AS symbol,
115 gitn.gene_name AS description,
116 scl.aln_quality AS alignment_quality
118 JOIN b$options{build}_snpchrposonref_$options{refseq} scpr ON s.snp_id=scpr.snp_id
119 JOIN b$options{build}_snpcontigloc_$options{refseq} scl ON scpr.snp_id=scl.snp_id
120 JOIN b$options{build}_contiginfo_$options{refseq} ci ON scl.ctg_id = ci.ctg_id
121 LEFT OUTER JOIN b$options{build}_snpcontiglocusid_$options{refseq} ml ON s.snp_id=ml.snp_id
122 LEFT OUTER JOIN geneidtoname gitn ON ml.locus_id=gitn.gene_id
123 JOIN univariation uv ON s.univar_id=uv.univar_id
124 JOIN univariation ruv ON uv.rev_univar_id=ruv.univar_id
125 WHERE (ci.group_term LIKE 'GRCh%' OR ci.group_term='Primary_Assembly') AND s.snp_id=? ORDER BY scl.aln_quality;
128 my $sth = $dbh->prepare(<<"END") // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
129 SELECT snp_id FROM b$options{build}_snpchrposonref_$options{refseq} WHERE chr=? AND pos=?;
132 my $sth_mrna_ss = $dbh->prepare(<<'END') // die "Unable to prepare mrna_ss statement: ".$dbh->errstr;
133 SELECT * FROM mrna WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
136 my $sth_cds_ss = $dbh->prepare(<<'END') // die "Unable to prepare cds_ss statement: ".$dbh->errstr;
137 SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
140 my $sth_mrna_cds = $dbh->prepare(<<'END') // die "Unable to prepare mrna_cds statement: ".$dbh->errstr;
141 SELECT * FROM mrna_cds_table WHERE cds_gi=?;
144 my $sth_allele_freq = $dbh->prepare(<<'END') // die "Unable to prepare allele_freq statement: ".$dbh->errstr;
145 SELECT ssl.snp_id AS snp_id,
146 si.subsnp_id AS subsnp_id,
148 si.submitted_ind_id AS submitted_ind_id,
149 ga.chr_num AS chr_num,
150 ga.rev_flag AS rev_flag,
152 FROM snpsubsnplink ssl
153 JOIN subind si on ssl.subsnp_id=si.subsnp_id
154 JOIN gtyallele ga on si.gty_id=ga.gty_id
155 JOIN allele a on ga.fwd_allele_id = a.allele_id
156 WHERE ssl.snp_id=? AND
157 si.chr=? AND ga.rev_flag = '0' AND
161 my $sth_rsid_info = $dbh->prepare(<<'END') // die "Unable to prepare rsid info statement: ".$dbh->errstr;
162 SELECT * FROM cds WHERE start<=? AND stop>=? AND chr=? ORDER BY variant;
165 my $sth_near_gene_start = $dbh->prepare(<<'END') // die "Unable to prepare near gene start statement: ".$dbh->errstr;
166 SELECT cds.start AS start,
169 gitn.gene_name AS description
171 LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id
172 WHERE cds.chr = ? AND cds.start > ?
173 ORDER BY cds.start ASC LIMIT 1;
176 my $sth_near_gene_stop = $dbh->prepare(<<'END') // die "Unable to prepare near gene stop statement: ".$dbh->errstr;
177 SELECT cds.start AS start,
180 gitn.gene_name AS description
182 LEFT OUTER JOIN geneidtoname gitn ON cds.geneid=gitn.gene_id
183 WHERE cds.chr = ? AND cds.stop < ?
184 ORDER BY cds.stop DESC LIMIT 1;
191 my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
192 $prot_table->{$codon} = $aa;
195 my @headers = qw(id chr pos ref alt);
196 my @row_keys_to_initialize = qw(gene nonsyn aa_ref aa_alt codon_ref codon_alt cdspos protpos loc num var acc phase reffreq altfreq refaltn);
197 print join("\t",@headers,'orig_id',@row_keys_to_initialize)."\n";
199 print $_ and next if /^#/;
201 my @row = split /\t/, $_;
203 @row{@headers} = @row[0..$#headers];
204 $row{orig_id} = $row{id};
205 # this is taken directly from vcf_fill_rsid; we want it for the time being anyway
206 if ($row{id} !~ /^(?:rs|ss)?\d+(?:,\s*(?:rs|ss)?\d+)*$/) {
207 my $new_id = find_rsid(chr => fixup_chr($row{chr}),
209 pos => $row{pos} - 1,
213 if ($new_id !~ /^(?:[\.\?])$/) {
214 $row{id} = "rs".$new_id;
217 if ($row{id} =~ /(rs|ss)(\d+)/) {
220 $id = find_real_snp_id(id => $id,
222 sth => $real_snp_sth,
224 if (not defined $id and
225 is_retired_snp(id => $id,
227 sth => $sth_snp_history,
229 if (defined $row{chr} and defined $row{pos}) {
230 $row{id} = "$row{chr}-$row{pos}";
234 $row{id} = "$type$id";
238 if (defined $row{id} and ($options{fix_pos} or
239 not defined $row{pos} or
240 not defined $row{chr} or
241 not defined $row{ref} or
242 not defined $row{alt})) {
243 my %info = find_chr_pos(id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0],
245 sth => $snp_chr_pos_sth,
247 for my $inf (qw(pos chr ref alt)) {
248 $row{$inf} = $info{$inf} if $options{fix_pos} and defined $info{$inf} or not defined $row{$inf};
250 # handle partially aligned snps
251 if ($options{fix_pos} and not defined $row{pos}) {
252 $row{pos} = $info{pos2};
254 if (not defined $row{pos} or not defined $row{chr}) {
255 if (defined $row{pos}) {
256 print STDERR "$row{id} is ambiguously placed\n";
259 print STDERR "Unable to find a position/chr for $row{id}\n";
263 my $mrna_information = mrna_information(dbh => $dbh,
265 # these are all 1 indexed
267 chr => fixup_chr($row{chr}),
270 cds_sth => $sth_cds_ss,
271 mrna_cds_sth => $sth_mrna_cds,
273 @row{@row_keys_to_initialize} = ('') x @row_keys_to_initialize;
274 if (keys %{$mrna_information}) {
275 # strip out existing information in $row{info}
277 $row{gene} = $mrna_information->{gene};
278 $row{nonsyn} = 1 if defined $mrna_information->{nonsyn} and $mrna_information->{nonsyn};
279 $row{aa_ref} = $mrna_information->{orig_aa} if defined $mrna_information->{orig_aa};
280 $row{aa_alt} = $mrna_information->{new_aa} if defined $mrna_information->{new_aa};
281 $row{codon_ref} = $mrna_information->{orig_codon} if defined $mrna_information->{orig_codon};
282 $row{codon_alt} = $mrna_information->{new_codon} if defined $mrna_information->{new_codon};
283 $row{cdspos} = $mrna_information->{cds_pos} if defined $mrna_information->{cds_pos};
284 $row{protpos} = $mrna_information->{prot_pos} if defined $mrna_information->{prot_pos};
285 $row{loc} = $mrna_information->{exon}?'exon':'intron' if defined $mrna_information->{gene};
286 $row{num} = $mrna_information->{number};
287 $row{var} = $mrna_information->{variant} if defined $mrna_information->{variant};
288 $row{acc} = $mrna_information->{accession};
289 $row{phase} = $mrna_information->{phase} if defined $mrna_information->{phase};
291 if ((not defined $row{gene} or not length $row{gene})
292 and $options{nearest_gene}) {
293 $row{gene} = find_nearest_gene(dbh => $dbh,
294 start_sth => $sth_near_gene_start,
295 stop_sth => $sth_near_gene_stop,
296 chr => fixup_chr($row{chr}),
300 my $allele_freq_information = allele_freq_information(dbh => $dbh,
301 sth => $sth_allele_freq,
302 id => [$row{id} =~ /(?:rs|ss)(\d+)/]->[0],
303 # these are all 1 indexed
305 chr => fixup_chr($row{chr}),
308 ) if defined $row{id} and $row{id} =~ /^(?:rs|ss)/;
309 if (keys %{$allele_freq_information}) {
310 # strip out existing information in $row{info}
311 if (exists $allele_freq_information->{alleles} and keys %{$allele_freq_information->{alleles}}) {
312 $row{reffreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{ref}));
313 $row{altfreq} = sprintf('%.3f',allele_freq($allele_freq_information->{alleles},$row{alt}));
314 $row{refaltn} = $allele_freq_information->{alleles}{total};
317 print join("\t",map {defined $_?$_:'NA'} @row{@headers,'orig_id',@row_keys_to_initialize},$#row > $#headers?@row[@headers..$#row]:())."\n";
318 if ($options{verbose} > 0 and $. % 100 == 0) {
319 print STDERR "At line $.\n";
323 sub find_real_snp_id {
327 my $rv = $param{sth}->execute($param{id},$param{id}) //
328 die "Unable to execute statement properly: ".$param{dbh}->errstr;
329 my ($real_id) = map {ref $_ ?@{$_}:()} map {ref $_ ?@{$_}:()} $param{sth}->fetchall_arrayref([0]);
330 if ($param{sth}->err) {
331 print STDERR $param{sth}->errstr;
336 return $real_id // $param{id};
343 my $rv = $param{sth}->execute($param{id}) //
344 die "Unable to execute statement properly: ".$param{dbh}->errstr;
345 my $results = $param{sth}->fetchall_arrayref({});
346 if ($param{sth}->err) {
347 print STDERR $param{sth}->errstr;
351 for my $result (@{$results}) {
355 if ($info{chr} eq 'NotOn') {
359 $info{orien_var_str} = $info{rev_var_str};
362 $info{orien_var_str} = $info{var_str};
364 my @alleles = split qr{/},$info{orien_var_str};
365 ($info{alt}) = grep {$_ ne $info{ref}} @alleles;
373 sub find_nearest_gene {
375 return undef unless defined $param{chr} and defined $param{pos};
379 for my $st (qw(start stop)) {
380 my $rv = $param{"${st}_sth"}->execute($param{chr},$param{pos}) //
381 die "Unable to execute nearest gene $st statement properly: ".$param{dbh}->errstr;
382 my $results = $param{"${st}_sth"}->fetchall_arrayref({});
383 if ($param{"${st}_sth"}->err) {
384 print STDERR $param{"${st}_sth"}->errstr;
385 $param{"${st}_sth"}->finish;
388 $possible_genes{$st} = $results->[0]
390 for my $st (qw(start stop)) {
391 # opposite of start/stop (stop/start)
392 my $st_op = $st eq q(start) ? q(stop) : q(start);
393 if (not defined $possible_genes{$st}{start}) {
394 if (defined $possible_genes{$st_op}{start}) {
395 return $possible_genes{$st_op}{symbol}
403 # if the start of the start gene is closer to the position of the
404 # snp than the stop of the stop gene, it's the nearest gene;
405 # otherwise, the stop gene is
406 if ($possible_genes{start}{start} - $param{pos} <
407 $param{pos} - $possible_genes{stop}{stop}
409 return $possible_genes{start}{symbol};
412 return $possible_genes{stop}{symbol};
417 sub mrna_information{
420 my $rv = $param{sth}->execute($param{pos},$param{pos},$param{chr}) or
421 die "Unable to execute statement properly: ".$param{dbh}->errstr;
423 my $results = $param{sth}->fetchall_arrayref({});
424 if ($param{sth}->err) {
425 print STDERR $param{sth}->errstr;
429 for my $result (@{$results}) {
430 if (not defined $chosen_result) {
431 $chosen_result = $result;
434 if (not defined $chosen_result->{variant} or
435 (defined $result->{variant} and
436 $chosen_result->{variant} > $result->{variant})) {
437 $chosen_result = $result;
441 if (defined $chosen_result and $chosen_result->{exon}) {
442 # try to figure out the cds if there is mrna information and
444 my $cds_rv = $param{cds_sth}->execute($param{pos},$param{pos},$param{chr}) or
445 die "Unable to execute statement properly: ".$param{dbh}->errstr;
446 my $cds_results = $param{cds_sth}->fetchall_arrayref({});
447 if ($param{cds_sth}->err) {
448 print STDERR $param{cds_sth}->errstr;
451 print STDERR Data::Dumper->Dump([$cds_results],[qw(cds_results)]) if $DEBUG;
453 for my $cds_result (@{$cds_results}) {
454 if (not defined $chosen_cds) {
455 $chosen_cds = $cds_result;
458 if (not defined $cds_result->{variant} or
459 (defined $chosen_cds->{variant} and
460 $chosen_cds->{variant} ge $cds_result->{variant})) {
461 $chosen_cds = $cds_result;
465 if (defined $chosen_cds and keys %{$chosen_cds}) {
466 my $comp_offset = $chosen_cds->{complement}?-1:1;
467 $chosen_result->{in_cds} = 1;
468 $chosen_result->{cds_pos} =
469 ($chosen_cds->{complement}?$chosen_cds->{stop}-$param{pos}:$param{pos}-$chosen_cds->{start}) +
470 $chosen_cds->{position};
471 $chosen_result->{gene} = $chosen_cds->{gene};
472 $chosen_result->{pos} = $chosen_cds->{pos};
473 $chosen_result->{number} = $chosen_cds->{number};
474 $chosen_result->{variant} = $chosen_cds->{variant};
475 $chosen_result->{acc} = $chosen_cds->{accession};
476 $chosen_result->{prot_pos} = sprintf("%.1f",$chosen_result->{cds_pos}/ 3);
477 $chosen_result->{phase} = ($chosen_cds->{phase} + $param{pos} - $chosen_cds->{start}) % 3;
478 print STDERR Data::Dumper->Dump([$chosen_cds,$chosen_result],[qw(chosen_cds chosen_result)]) if $DEBUG;
479 my $mrna_cds_rv = $param{mrna_cds_sth}->execute($chosen_cds->{gi}) or
480 die "Unable to execute statement properly: ".$param{dbh}->errstr;
481 my $mrna_cds_result = $param{mrna_cds_sth}->fetchrow_hashref();
482 if (defined $mrna_cds_result and keys %{$mrna_cds_result}) {
483 print STDERR Data::Dumper->Dump([$mrna_cds_result,$chosen_result,\%param],
484 [qw(mrna_cds_result chosen_result *param)]) if $DEBUG;
485 my $codon = substr $mrna_cds_result->{sequence},$chosen_result->{cds_pos} - (($chosen_result->{cds_pos}) % 3) ,3;
486 my $orig_aa = convert_to_prot_seq($codon);
487 my $modified_codon = $codon;
488 substr($modified_codon,$chosen_result->{cds_pos} % 3,1)=
489 ($chosen_cds->{complement}?dna_complement($param{alt}):$param{alt});
490 my $new_aa = convert_to_prot_seq($modified_codon);
491 print STDERR Dumper($mrna_cds_result,$chosen_result,$chosen_cds,{codon_pos => $chosen_result->{cds_pos} - ($chosen_result->{cds_pos} % 3),
492 codon=>$codon,modified_codon=>$modified_codon,pos=>$param{pos}}) if $DEBUG;
495 print STDERR map {$a++; ($a*3)." $_\n"} $mrna_cds_result->{sequence} =~ m/.../g;
497 $chosen_result->{orig_aa} = $orig_aa;
498 $chosen_result->{new_aa} = $new_aa;
499 $chosen_result->{new_codon} = $modified_codon;
500 $chosen_result->{orig_codon} = $codon;
501 $chosen_result->{nonsyn} = $new_aa ne $orig_aa ? 1 : 0;
506 return $chosen_result // {};
512 return $chr if not defined $chr;
514 $chr =~ s/^NC_00000?//;
516 if (looks_like_number($chr)) {
527 sub allele_freq_information{
530 return {} if $param{id} !~ /^\d+$/;
532 my $rv = $param{sth}->execute($param{id},$param{chr}) or
533 die "Unable to execute statement properly: ".$param{dbh}->errstr." id:".$param{id}." chr:".$param{chr};
535 my $results = $param{sth}->fetchall_arrayref({});
536 if ($param{sth}->err) {
537 print STDERR $param{sth}->errstr;
544 for my $result (@{$results}) {
545 next if exists $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}};
546 $indivs{$result->{submitted_ind_id}}{'chr_'.$result->{chr_num}} = $result->{allele};
547 if (defined $result->{loc_pop_id}) {
548 $indivs{$result->{submitted_ind_id}}{loc_pop_id} = $result->{loc_pop_id};
549 $indivs{$result->{submitted_ind_id}}{pop_handle} = $result->{pop_handle};
550 $pop_alleles{$result->{loc_pop_id}}{$result->{allele}}++;
551 $pop_alleles{$result->{loc_pop_id}}{total}++;
553 $alleles{$result->{allele}}++;
555 $seen_indiv{$result->{submitted_ind_id}.'.'.$result->{chr_num}} = 1;
559 for my $indiv (values %indivs) {
560 if (defined $indiv->{chr_1} and defined $indiv->{chr_2}) {
561 my $genotype = join('/',
562 sort($indiv->{chr_1},
564 if (defined $indiv->{loc_pop_id}) {
565 $pop_genotypes{$indiv->{loc_pop_id}}{$genotype}++;
566 $pop_genotypes{$indiv->{loc_pop_id}}{total}++;
568 $genotypes{$genotype}++;
572 return {pop_alleles => \%pop_alleles,
573 pop_genotypes => \%pop_genotypes,
574 genotypes => \%genotypes,
575 alleles => \%alleles,
580 my ($freq_table,$allele) = @_;
581 my $flipped_allele = $allele;
582 $flipped_allele =~ tr/ACTG/TGAC/;
583 if (defined $allele) {
584 if (exists $freq_table->{$allele} and defined $freq_table->{$allele}) {
585 return $freq_table->{$allele}/$freq_table->{total};
587 elsif (exists $freq_table->{$flipped_allele} and defined $freq_table->{$flipped_allele}) {
588 return $freq_table->{$flipped_allele}/$freq_table->{total};
592 my @alleles = grep {$_ ne 'total'} keys %{$freq_table};
593 return join(',',map {$_.':'.(defined $freq_table->{$_} ?
594 $freq_table->{$_}/$freq_table->{total} : 0)} @alleles);
601 my $rv = $param{sth}->execute($param{chr},$param{pos}) //
602 die "Unable to execute statement properly: ".$param{dbh}->errstr;
603 my @snp_ids = grep {$_ =~ /^\d+$/}
604 map {ref $_ eq 'ARRAY'? @{$_}:$_}
605 map {ref $_ eq 'ARRAY'? @{$_}:$_}
606 $sth->fetchall_arrayref([0]);
608 print STDERR $sth->errstr;
614 return join(',',@snp_ids);
619 return join('',map {tr /ATCG/TAGC/; $_} reverse split //,$seq);
622 sub convert_to_prot_seq{
626 for my $pos (0..(length($mrna) / 3 - 1)) {
627 my $codon = substr($mrna,$pos*3,3);
628 if (not exists $prot_table->{$codon}) {
629 # this isn't quite right, as we should really be handling
630 # ambiguities which do not result in different amino
631 # acids, but for the time being, do it this way.
635 $prot .= $prot_table->{$codon};
656 TTT F Phe Phenylalanine
657 TTC F Phe Phenylalanine
692 GAA E Glu Glutamic acid
693 GAG E Glu Glutamic acid
694 GAT D Asp Aspartic acid
695 GAC D Asp Aspartic acid