6 # psql snp -c "COPY mrna_cds_table FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
12 my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
13 $prot_table->{$codon} = $aa;
18 $current_stanza .= $_;
20 handle_mrna_stanza($current_stanza);
25 sub handle_mrna_stanza{
26 my ($current_stanza) = @_;
28 my ($accession,$version,$gi) =
29 $current_stanza =~ m/^VERSION\s+([^\.]+)\.(\d+)\s+GI:(\d+)$/m;
31 $current_stanza =~ m/^\s+\/gene=\"?([^"]+)\"?$/m;
33 $current_stanza =~ m/^\s+CDS\s+(\d+(?:\.\.\d+))$/m;
34 return if not defined $cds;
35 my ($start,$stop) = split /\.\./,$cds;
36 return if not defined $start;
37 if (not defined $stop) {
41 $current_stanza =~ m/^(ORIGIN.+)$/sm;
42 $sequence =~ s{\s*//\s*$}{};
43 $sequence =~ s{^ORIGIN}{};
44 $sequence =~ s{[\d\s\n]+}{}g;
45 $sequence = uc($sequence);
46 my $sequence2 = substr $sequence,($start-1),($stop-$start+1);
47 # There are other posibilities, like CTG, ACG, etc; so we can't test this.
48 # if ($sequence2 !~ /^A[UT]G/i) {
49 # print STDERR $sequence2."\n";
50 # print STDERR $gene."\n";
51 # die "Sequence doesn't start with a start codon";
53 if ($sequence2 !~ /(?:[TU]AA|[TU]AG|[TU]GA)$/) {
54 print STDERR $sequence2."\n";
55 print STDERR $gene."\n";
56 die "Sequence doesn't end with a stop codon";
60 my ($cds_stanza) = $current_stanza =~ /^\s{5}CDS\s+(.+?)^(?:\s{5}\w+|ORIGIN)/ms;
61 if (length $cds_stanza) {
62 ($cds_gi) = $cds_stanza =~ /db_xref="GI:(\d+)"/;
63 if (not defined $cds_gi) {
67 print join("\t",$gi,$cds_gi,$accession,$version,$gene,$start,$stop,length($sequence2), uc($sequence2),convert_to_prot_seq($sequence2))."\n";
70 sub convert_to_prot_seq{
74 for my $pos (0..(length($mrna) / 3 - 1)) {
75 my $codon = substr($mrna,$pos*3,3);
76 if (not exists $prot_table->{$codon}) {
80 $prot .= $prot_table->{$codon};
100 TTT F Phe Phenylalanine
101 TTC F Phe Phenylalanine
136 GAA E Glu Glutamic acid
137 GAG E Glu Glutamic acid
138 GAT D Asp Aspartic acid
139 GAC D Asp Aspartic acid