--- /dev/null
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# psql snp -c "COPY mrna_cds_table FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
+
+my $prot_table;
+
+while (<DATA>) {
+ chomp;
+ my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+ $prot_table->{$codon} = $aa;
+}
+
+my $current_stanza;
+while (<>) {
+ $current_stanza .= $_;
+ if ($_ =~ m{^//$}) {
+ handle_mrna_stanza($current_stanza);
+ $current_stanza = '';
+ }
+}
+
+sub handle_mrna_stanza{
+ my ($current_stanza) = @_;
+
+ my ($accession,$version,$gi) =
+ $current_stanza =~ m/^VERSION\s+([^\.]+)\.(\d+)\s+GI:(\d+)$/m;
+ my ($gene) =
+ $current_stanza =~ m/^\s+\/gene=\"?([^"]+)\"?$/m;
+ my ($cds) =
+ $current_stanza =~ m/^\s+CDS\s+(\d+(?:\.\.\d+))$/m;
+ return if not defined $cds;
+ my ($start,$stop) = split /\.\./,$cds;
+ return if not defined $start;
+ if (not defined $stop) {
+ $stop = $start;
+ }
+ my ($sequence) =
+ $current_stanza =~ m/^(ORIGIN.+)$/sm;
+ $sequence =~ s{\s*//\s*$}{};
+ $sequence =~ s{^ORIGIN}{};
+ $sequence =~ s{[\d\s\n]+}{}g;
+ $sequence = uc($sequence);
+ my $sequence2 = substr $sequence,($start-1),($stop-$start+1);
+ # There are other posibilities, like CTG, ACG, etc; so we can't test this.
+ # if ($sequence2 !~ /^A[UT]G/i) {
+ # print STDERR $sequence2."\n";
+ # print STDERR $gene."\n";
+ # die "Sequence doesn't start with a start codon";
+ # }
+ if ($sequence2 !~ /(?:[TU]AA|[TU]AG|[TU]GA)$/) {
+ print STDERR $sequence2."\n";
+ print STDERR $gene."\n";
+ die "Sequence doesn't end with a stop codon";
+ }
+ # figure out cds gi
+ my $cds_gi='NULL';
+ my ($cds_stanza) = $current_stanza =~ /^\s{5}CDS\s+(.+?)^(?:\s{5}\w+|ORIGIN)/ms;
+ if (length $cds_stanza) {
+ ($cds_gi) = $cds_stanza =~ /db_xref="GI:(\d+)"/;
+ if (not defined $cds_gi) {
+ $cds_gi = 'NULL';
+ }
+ }
+ print join("\t",$gi,$cds_gi,$accession,$version,$gene,$start,$stop,length($sequence2), uc($sequence2),convert_to_prot_seq($sequence2))."\n";
+}
+
+sub convert_to_prot_seq{
+ my ($mrna) = @_;
+$mrna =~ s/U/T/g;
+ my $prot = '';
+ for my $pos (0..(length($mrna) / 3 - 1)) {
+ my $codon = substr($mrna,$pos*3,3);
+ if (not exists $prot_table->{$codon}) {
+ $prot .= 'N';
+ }
+ else {
+ $prot .= $prot_table->{$codon};
+ }
+ }
+ return $prot;
+}
+
+__DATA__
+ATT I Ile Isoleucine
+ATC I Ile Isoleucine
+ATA I Ile Isoleucine
+CTT L Leu Leucine
+CTC L Leu Leucine
+CTA L Leu Leucine
+CTG L Leu Leucine
+TTA L Leu Leucine
+TTG L Leu Leucine
+GTT V Val Valine
+GTC V Val Valine
+GTA V Val Valine
+GTG V Val Valine
+TTT F Phe Phenylalanine
+TTC F Phe Phenylalanine
+ATG M Met Methionine
+TGT C Cys Cysteine
+TGC C Cys Cysteine
+GCT A Ala Alanine
+GCC A Ala Alanine
+GCA A Ala Alanine
+GCG A Ala Alanine
+GGT G Gly Glycine
+GGC G Gly Glycine
+GGA G Gly Glycine
+GGG G Gly Glycine
+CCT P Pro Proline
+CCC P Pro Proline
+CCA P Pro Proline
+CCG P Pro Proline
+ACT T Thr Threonine
+ACC T Thr Threonine
+ACA T Thr Threonine
+ACG T Thr Threonine
+TCT S Ser Serine
+TCC S Ser Serine
+TCA S Ser Serine
+TCG S Ser Serine
+AGT S Ser Serine
+AGC S Ser Serine
+TAT Y Tyr Tyrosine
+TAC Y Tyr Tyrosine
+TGG W Trp Tryptophan
+CAA Q Gln Glutamine
+CAG Q Gln Glutamine
+AAT N Asn Asparagine
+AAC N Asn Asparagine
+CAT H His Histidine
+CAC H His Histidine
+GAA E Glu Glutamic acid
+GAG E Glu Glutamic acid
+GAT D Asp Aspartic acid
+GAC D Asp Aspartic acid
+AAA K Lys Lysine
+AAG K Lys Lysine
+CGT R Arg Arginine
+CGC R Arg Arginine
+CGA R Arg Arginine
+CGG R Arg Arginine
+AGA R Arg Arginine
+AGG R Arg Arginine
+TAA * * Stop codon
+TAG * * Stop codon
+TGA * * Stop codon
--- /dev/null
+#!/usr/bin/perl
+# intron_exon_loader makes loadable mrna and cds files, and is
+# released under the terms of the GPL version 2, or any later version,
+# at your option. See the file README and COPYING for more
+# information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+intron_exon_loader - Make SQL loadable mrna and cds files
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+ --debug, -d debugging level (Default 0)
+ --help, -h display this help
+ --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader| \
+ psql snp -c "COPY mrna FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader --output cds| \
+ psql snp -c "COPY cds FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
+
+
+=cut
+
+use vars qw($DEBUG);
+
+my %options = (type => 'mrna',
+ debug => 0,
+ help => 0,
+ man => 0,
+ );
+
+GetOptions(\%options,
+ 'type|t=s',
+ 'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+if ($options{type} !~ /^(mrna|cds)$/) {
+ push @USAGE_ERRORS, "--type must be one of mrna or cds, not '$options{type}'";
+}
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+use DBI;
+use Data::Dumper;
+
+my $dbh = DBI->connect("dbi:Pg:dbname=snp",'','',{AutoCommit => 0}) or
+ die "Unable to connect to database".$DBI::errstr;
+
+my $sth = $dbh->prepare(<<'END') // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT * FROM contigs where gi=?;
+END
+
+sub main {
+ my ($options) = @_;
+ my %current_contig;
+ my $in_stanza = 0;
+ my $skip_until_next_contig;
+ my $current_stanza = '';
+ my $type = lc($options{type});
+ my $stanza_type = $type eq 'mrna' ? 'mRNA' : 'CDS';
+ while (<>) {
+ chomp;
+ if (/^VERSION/) {
+ $skip_until_next_contig = 0;
+ my ($acc,$ver,$gi) = $_ =~ m/^VERSION\s+([A-Z0-9_]+)\.(\d+)\s+GI\:(\d+)/;
+ if (not defined $gi) {
+ die "No GI in '$_' on line $.";
+ }
+ %current_contig = (acc => $acc,
+ ver => $ver,
+ gi => $gi,
+ );
+ %current_contig = contig_information($dbh,$sth,%current_contig);
+ if (not defined $current_contig{chr_start}) {
+ $skip_until_next_contig = 1;
+ }
+ }
+ next if $skip_until_next_contig;
+ if (/^\s{5}(\S+)/) {
+ if ($1 eq $stanza_type) {
+ if (length $current_stanza and $in_stanza) {
+ output_current_stanza($type,$current_stanza,\%current_contig);
+ $current_stanza = '';
+ $in_stanza = 0;
+ }
+ $in_stanza = 1;
+ s/^\s{5}\S+//;
+ s/^\s+//;
+ $current_stanza = $_."\n";
+ } elsif (length $current_stanza) {
+ output_current_stanza($type,$current_stanza,\%current_contig);
+ $current_stanza = '';
+ $in_stanza = 0;
+ } else {
+ $in_stanza = 0;
+ }
+ }
+ elsif (/^[A-Z]/ and length $current_stanza) {
+ output_current_stanza($type,$current_stanza,\%current_contig);
+ $current_stanza = '';
+ $in_stanza = 0;
+ } else {
+ if ($in_stanza) {
+ $current_stanza .= $_."\n";
+ }
+ }
+ }
+}
+
+main();
+
+sub contig_information{
+ my ($dbh,$sth,%current_contig) = @_;
+
+ my $rv = $sth->execute($current_contig{gi}) or
+ die "Unable to execute statement properly: ".$dbh->errstr;
+
+ my $results = $sth->fetchall_hashref('gi');
+ if ($sth->err) {
+ print STDERR $sth->errstr;
+ }
+ if (exists $results->{$current_contig{gi}}) {
+ %current_contig = (%current_contig,
+ %{$results->{$current_contig{gi}}},
+ );
+ }
+ return %current_contig;
+}
+
+sub output_current_stanza{
+ my ($type,$current_stanza,$current_contig) = @_;
+ $current_stanza =~ s/^[\s\t]+//mg;
+ $current_stanza =~ s/\n\s*([^\/])/ $1/mg;
+ my ($extent,@other_fields) = split /\n/,$current_stanza;
+ $extent =~ s/\s+//g;
+ # print "extent: $extent\n";
+ # figure out extent
+ my $complement = $extent =~ m/complement/;
+ $extent =~ s/complement\((.+)\)$/$1/;
+ # strip out the join, because we don't care about it
+ $extent =~ s/join\((.+)\)$/$1/;
+ # handle fields
+ my %fields = map {my ($key,$value) = $_ =~ m{\/([^=]+)="?(.+?)"?$}s;
+ (not defined $key)?():(($key eq 'db_xref' && $value =~ m/([^:]+):(.+)/)?(lc($1),$2):(lc($key),$value))} @other_fields;
+ return if not exists $fields{product};
+ if (exists $fields{transcript_id}) {
+ ($fields{accession},$fields{accession_ver}) = $fields{transcript_id} =~ m/(.+)\.(\d+)/;
+ }
+ elsif (exists $fields{protein_id}) {
+ ($fields{accession},$fields{accession_ver}) = $fields{protein_id} =~ m/(.+)\.(\d+)/;
+ }
+ else {
+ return;
+ }
+ my ($variant) = $fields{product} =~ m/variant\s*(\d+)/i;
+ if (not defined $variant or not length $variant) {
+ ($variant) = $fields{product} =~ m/isoform\s*([^\s]+)/;
+ }
+ if (not defined $variant or not length $variant) { # or $variant !~ /^\d+$/) {
+ $variant = 'NULL';
+ }
+ my @extents = split /\s*\,\s*/,$extent;
+ if ($type eq 'mrna') {
+ output_current_mrna(\@extents,$variant,$complement,\%fields,$current_contig);
+ }
+ else {
+ output_current_cds(\@extents,$variant,$complement,\%fields,$current_contig);
+ }
+}
+
+sub output_current_mrna{
+ my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+ # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+# complement(join(126698..126805,126953..129696,
+# 129790..129847,130075..130566))
+# /gene="LOC729737"
+# /product="hypothetical protein LOC729737"
+# /note="Derived by automated computational analysis using
+# gene prediction method: GNOMON. Supporting evidence
+# includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+# /transcript_id="AM_001133863.3"
+# /db_xref="GI:310118477"
+ # /db_xref="Aeneid:729737"
+ my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+ $variant,
+ $complement ? 'TRUE':'FALSE',
+ @{$fields}{qw(gene geneid)},
+ @{$current_contig}{qw(gi chr)},
+ );
+ my $previous_stop = undef;
+ my $previous_start = undef;
+ my $number = 1;
+ my @exons = @{$extents};
+ if ($complement) {
+ @exons = reverse @exons;
+ }
+ for my $exon (@exons) {
+ # strip off non-numerics like '>' and '<' here,
+ # because we don't want to deal with them
+ my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+ if (not defined $stop) {
+ $stop = $start;
+ }
+ if (defined $previous_stop and $previous_stop > 0 and
+ defined $previous_start and $previous_start > 0
+ ) {
+ my $intron_start = $previous_stop+1;
+ my $intron_stop = $start-1;
+ if ($complement) {
+ $intron_start = $stop+1;
+ $intron_stop = $previous_start-1;
+ }
+ print join("\t",
+ @output_fields,
+ 'FALSE',
+ $number-1,
+ $current_contig->{chr_start}+$intron_start-1,
+ $current_contig->{chr_start}+$intron_stop-1,
+ $intron_start,
+ $intron_stop,
+ )."\n";
+ }
+ print join("\t",
+ @output_fields,
+ 'TRUE',
+ $number,
+ $current_contig->{chr_start}+$start-1,
+ $current_contig->{chr_start}+$stop-1,
+ $start,
+ $stop,
+ )."\n";
+ $number++;
+ $previous_stop = $stop;
+ $previous_start = $start;
+ }
+}
+
+
+sub output_current_cds {
+ my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+ # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+# complement(join(126698..126805,126953..129696,
+# 129790..129847,130075..130566))
+# /gene="LOC729737"
+# /product="hypothetical protein LOC729737"
+# /note="Derived by automated computational analysis using
+# gene prediction method: GNOMON. Supporting evidence
+# includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+# /transcript_id="AM_001133863.3"
+# /db_xref="GI:310118477"
+ # /db_xref="Aeneid:729737"
+ if (not exists $fields->{protein_id}) {
+ return;
+ }
+ my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+ $variant,
+ $complement ? 'TRUE':'FALSE',
+ @{$fields}{qw(gene geneid)},
+ @{$current_contig}{qw(gi chr)},
+ );
+ my $previous_stop = undef;
+ my $previous_start = undef;
+ my $number = 1;
+ my @exons = @{$extents};
+ if ($complement) {
+ @exons = reverse @exons;
+ }
+ my $position = 0;
+ my $current_phase = 0;
+ for my $exon (@exons) {
+ # strip off non-numerics like '>' and '<' here,
+ # because we don't want to deal with them
+ my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+ if (not defined $stop) {
+ $stop = $start;
+ }
+ print join("\t",
+ @output_fields,
+ $number,
+ $current_phase,
+ $current_contig->{chr_start}+$start-1,
+ $current_contig->{chr_start}+$stop-1,
+ $start,
+ $stop,
+ $position,
+ )."\n";
+ $current_phase = ($current_phase + $stop - $start + 1) % 3;
+ $number++;
+ $previous_stop = $stop;
+ $previous_start = $start;
+ $position += abs($start-$stop)+1;
+ }
+}