--- /dev/null
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# psql snp -c "COPY mrna_cds_table FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
+
+my $prot_table;
+
+while (<DATA>) {
+ chomp;
+ my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+ $prot_table->{$codon} = $aa;
+}
+
+my $current_stanza;
+while (<>) {
+ $current_stanza .= $_;
+ if ($_ =~ m{^//$}) {
+ handle_mrna_stanza($current_stanza);
+ $current_stanza = '';
+ }
+}
+
+sub handle_mrna_stanza{
+ my ($current_stanza) = @_;
+
+ my ($accession,$version,$gi) =
+ $current_stanza =~ m/^VERSION\s+([^\.]+)\.(\d+)\s+GI:(\d+)$/m;
+ my ($gene) =
+ $current_stanza =~ m/^\s+\/gene=\"?([^"]+)\"?$/m;
+ my ($cds) =
+ $current_stanza =~ m/^\s+CDS\s+(\d+(?:\.\.\d+))$/m;
+ return if not defined $cds;
+ my ($start,$stop) = split /\.\./,$cds;
+ return if not defined $start;
+ if (not defined $stop) {
+ $stop = $start;
+ }
+ my ($sequence) =
+ $current_stanza =~ m/^(ORIGIN.+)$/sm;
+ $sequence =~ s{\s*//\s*$}{};
+ $sequence =~ s{^ORIGIN}{};
+ $sequence =~ s{[\d\s\n]+}{}g;
+ $sequence = uc($sequence);
+ my $sequence2 = substr $sequence,($start-1),($stop-$start+1);
+ # There are other posibilities, like CTG, ACG, etc; so we can't test this.
+ # if ($sequence2 !~ /^A[UT]G/i) {
+ # print STDERR $sequence2."\n";
+ # print STDERR $gene."\n";
+ # die "Sequence doesn't start with a start codon";
+ # }
+ if ($sequence2 !~ /(?:[TU]AA|[TU]AG|[TU]GA)$/) {
+ print STDERR $sequence2."\n";
+ print STDERR $gene."\n";
+ die "Sequence doesn't end with a stop codon";
+ }
+ # figure out cds gi
+ my $cds_gi='NULL';
+ my ($cds_stanza) = $current_stanza =~ /^\s{5}CDS\s+(.+?)^(?:\s{5}\w+|ORIGIN)/ms;
+ if (length $cds_stanza) {
+ ($cds_gi) = $cds_stanza =~ /db_xref="GI:(\d+)"/;
+ if (not defined $cds_gi) {
+ $cds_gi = 'NULL';
+ }
+ }
+ print join("\t",$gi,$cds_gi,$accession,$version,$gene,$start,$stop,length($sequence2), uc($sequence2),convert_to_prot_seq($sequence2))."\n";
+}
+
+sub convert_to_prot_seq{
+ my ($mrna) = @_;
+$mrna =~ s/U/T/g;
+ my $prot = '';
+ for my $pos (0..(length($mrna) / 3 - 1)) {
+ my $codon = substr($mrna,$pos*3,3);
+ if (not exists $prot_table->{$codon}) {
+ $prot .= 'N';
+ }
+ else {
+ $prot .= $prot_table->{$codon};
+ }
+ }
+ return $prot;
+}
+
+__DATA__
+ATT I Ile Isoleucine
+ATC I Ile Isoleucine
+ATA I Ile Isoleucine
+CTT L Leu Leucine
+CTC L Leu Leucine
+CTA L Leu Leucine
+CTG L Leu Leucine
+TTA L Leu Leucine
+TTG L Leu Leucine
+GTT V Val Valine
+GTC V Val Valine
+GTA V Val Valine
+GTG V Val Valine
+TTT F Phe Phenylalanine
+TTC F Phe Phenylalanine
+ATG M Met Methionine
+TGT C Cys Cysteine
+TGC C Cys Cysteine
+GCT A Ala Alanine
+GCC A Ala Alanine
+GCA A Ala Alanine
+GCG A Ala Alanine
+GGT G Gly Glycine
+GGC G Gly Glycine
+GGA G Gly Glycine
+GGG G Gly Glycine
+CCT P Pro Proline
+CCC P Pro Proline
+CCA P Pro Proline
+CCG P Pro Proline
+ACT T Thr Threonine
+ACC T Thr Threonine
+ACA T Thr Threonine
+ACG T Thr Threonine
+TCT S Ser Serine
+TCC S Ser Serine
+TCA S Ser Serine
+TCG S Ser Serine
+AGT S Ser Serine
+AGC S Ser Serine
+TAT Y Tyr Tyrosine
+TAC Y Tyr Tyrosine
+TGG W Trp Tryptophan
+CAA Q Gln Glutamine
+CAG Q Gln Glutamine
+AAT N Asn Asparagine
+AAC N Asn Asparagine
+CAT H His Histidine
+CAC H His Histidine
+GAA E Glu Glutamic acid
+GAG E Glu Glutamic acid
+GAT D Asp Aspartic acid
+GAC D Asp Aspartic acid
+AAA K Lys Lysine
+AAG K Lys Lysine
+CGT R Arg Arginine
+CGC R Arg Arginine
+CGA R Arg Arginine
+CGG R Arg Arginine
+AGA R Arg Arginine
+AGG R Arg Arginine
+TAA * * Stop codon
+TAG * * Stop codon
+TGA * * Stop codon