]> git.donarmstrong.com Git - dbsnp.git/commitdiff
add intron/exon schema and mrna cds loader
authorDon Armstrong <don@donarmstrong.com>
Tue, 13 Dec 2011 22:31:41 +0000 (14:31 -0800)
committerDon Armstrong <don@donarmstrong.com>
Tue, 13 Dec 2011 22:31:41 +0000 (14:31 -0800)
schema/extra_schema/intron_exon_schema.sql [new file with mode: 0644]
schema/extra_schema/intron_exon_schema_indexes.sql [new file with mode: 0644]
utils/human_mrna_cds_insert.pl [new file with mode: 0755]
utils/intron_exon_loader.pl [new file with mode: 0755]

diff --git a/schema/extra_schema/intron_exon_schema.sql b/schema/extra_schema/intron_exon_schema.sql
new file mode 100644 (file)
index 0000000..e1aed7f
--- /dev/null
@@ -0,0 +1,56 @@
+CREATE TABLE contigs (
+       tax_id int NOT NULL,
+       chr VARCHAR(127) NOT NULL,
+       chr_start INT NOT NULL,
+       chr_stop INT NOT NULL,
+       orientation VARCHAR(1) NOT NULL DEFAULT '+',
+       feature_name VARCHAR(127) NOT NULL,
+       gi INT,
+       feature_type VARCHAR(127) NOT NULL
+);
+
+CREATE TABLE exons (
+       gi int NOT NULL,
+       accession VARCHAR(127) NOT NULL,
+       accession_ver INT NOT NULL,
+       number INT NOT NULL,
+       start INT NOT NULL,
+       stop INT NOT NULL       
+       );
+
+CREATE TABLE cds (
+       gi int NOT NULL,
+       accession VARCHAR(127) NOT NULL,
+       accession_ver INT NOT NULL,
+       variant VARCHAR(127),
+       complement BOOLEAN DEFAULT FALSE,
+       gene VARCHAR (127),
+       geneid INT NOT NULL,
+       contig_gi INT NOT NULL,
+       chr VARCHAR (127),
+       number INT NOT NULL,
+       phase INT NOT NULL,
+       start INT NOT NULL,
+       stop INT NOT NULL,
+       contig_start INT NOT NULL,      
+       contig_stop INT NOT NULL,
+       position INT NOT NULL);
+
+CREATE TABLE mrna (
+       gi int NOT NULL,
+       accession VARCHAR(127) NOT NULL,
+       accession_ver INT NOT NULL,
+       variant INT,
+       complement BOOLEAN DEFAULT FALSE,
+       gene VARCHAR (127),
+       geneid INT NOT NULL,
+       contig_gi INT NOT NULL,
+       chr VARCHAR (127),
+       exon BOOLEAN DEFAULT TRUE,
+       number INT NOT NULL,
+       start INT NOT NULL,
+       stop INT NOT NULL,
+       contig_start INT NOT NULL,      
+       contig_stop INT NOT NULL);
+
+
diff --git a/schema/extra_schema/intron_exon_schema_indexes.sql b/schema/extra_schema/intron_exon_schema_indexes.sql
new file mode 100644 (file)
index 0000000..dd2ba09
--- /dev/null
@@ -0,0 +1,30 @@
+CREATE UNIQUE INDEX ON contigs(gi);
+CREATE INDEX ON contigs(feature_name);
+CREATE INDEX ON contigs(chr);
+
+CREATE UNIQUE INDEX ON exons(gi,number);
+CREATE UNIQUE INDEX ON exons(accession,accession_ver,number);
+CREATE INDEX ON exons(accession);
+
+CREATE UNIQUE INDEX ON cds(gi,number);
+CREATE UNIQUE INDEX ON cds(accession,accession_ver,number);
+CREATE INDEX ON cds(accession);
+
+CREATE INDEX ON cds(accession);
+CREATE INDEX ON cds(gene);
+CREATE INDEX ON cds(start);
+CREATE INDEX ON cds(stop);
+CREATE INDEX ON cds(gi);
+CREATE INDEX ON cds(start,stop);
+CREATE INDEX ON cds(chr);
+
+-- CREATE UNIQUE INDEX ON mrna(accession,ctg_gi,number);
+-- CREATE INDEX ON mrna(gi,number);
+-- CREATE INDEX ON mrna(accession,accession_ver,number);
+CREATE INDEX ON mrna(accession);
+CREATE INDEX ON mrna(gene);
+CREATE INDEX ON mrna(start);
+CREATE INDEX ON mrna(stop);
+CREATE INDEX ON mrna(gi);
+CREATE INDEX ON mrna(start,stop);
+CREATE INDEX ON mrna(chr);
\ No newline at end of file
diff --git a/utils/human_mrna_cds_insert.pl b/utils/human_mrna_cds_insert.pl
new file mode 100755 (executable)
index 0000000..ad15784
--- /dev/null
@@ -0,0 +1,150 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# psql snp -c "COPY mrna_cds_table FROM STDIN WITH DELIMITER '   ' NULL AS 'NULL'";
+
+my $prot_table;
+
+while (<DATA>) {
+    chomp;
+    my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+    $prot_table->{$codon} = $aa;
+}
+
+my $current_stanza;
+while (<>) {
+    $current_stanza .= $_;
+    if ($_ =~ m{^//$}) {
+       handle_mrna_stanza($current_stanza);
+       $current_stanza = '';
+    }
+}
+
+sub handle_mrna_stanza{
+    my ($current_stanza) = @_;
+
+    my ($accession,$version,$gi) =
+       $current_stanza =~ m/^VERSION\s+([^\.]+)\.(\d+)\s+GI:(\d+)$/m;
+    my ($gene) =
+       $current_stanza =~ m/^\s+\/gene=\"?([^"]+)\"?$/m;
+    my ($cds) =
+       $current_stanza =~ m/^\s+CDS\s+(\d+(?:\.\.\d+))$/m;
+    return if not defined $cds;
+    my ($start,$stop) = split /\.\./,$cds;
+    return if not defined $start;
+    if (not defined $stop) {
+       $stop = $start;
+    }
+    my ($sequence) =
+       $current_stanza =~ m/^(ORIGIN.+)$/sm;
+    $sequence =~ s{\s*//\s*$}{};
+    $sequence =~ s{^ORIGIN}{};
+    $sequence =~ s{[\d\s\n]+}{}g;
+    $sequence = uc($sequence);
+    my $sequence2 = substr $sequence,($start-1),($stop-$start+1);
+    # There are other posibilities, like CTG, ACG, etc; so we can't test this.
+    # if ($sequence2 !~ /^A[UT]G/i) {
+    #  print STDERR $sequence2."\n";
+    #  print STDERR $gene."\n";
+    #  die "Sequence doesn't start with a start codon";
+    # }
+    if ($sequence2 !~ /(?:[TU]AA|[TU]AG|[TU]GA)$/) {
+       print STDERR $sequence2."\n";
+       print STDERR $gene."\n";
+       die "Sequence doesn't end with a stop codon";
+    }
+    # figure out cds gi
+    my $cds_gi='NULL';
+    my ($cds_stanza) = $current_stanza =~ /^\s{5}CDS\s+(.+?)^(?:\s{5}\w+|ORIGIN)/ms;
+    if (length $cds_stanza) {
+       ($cds_gi) = $cds_stanza =~ /db_xref="GI:(\d+)"/;
+       if (not defined $cds_gi) {
+           $cds_gi = 'NULL';
+       }
+    }
+    print join("\t",$gi,$cds_gi,$accession,$version,$gene,$start,$stop,length($sequence2), uc($sequence2),convert_to_prot_seq($sequence2))."\n";
+}
+
+sub convert_to_prot_seq{
+    my ($mrna) = @_;
+$mrna =~ s/U/T/g;
+    my $prot = '';
+    for my $pos (0..(length($mrna) / 3 - 1)) {
+       my $codon = substr($mrna,$pos*3,3);
+       if (not exists $prot_table->{$codon}) {
+          $prot .= 'N';
+       }
+       else {
+           $prot .= $prot_table->{$codon};
+       }
+    }
+    return $prot;
+}
+
+__DATA__
+ATT    I       Ile     Isoleucine
+ATC    I       Ile     Isoleucine
+ATA    I       Ile     Isoleucine
+CTT    L       Leu     Leucine
+CTC    L       Leu     Leucine
+CTA    L       Leu     Leucine
+CTG    L       Leu     Leucine
+TTA    L       Leu     Leucine
+TTG    L       Leu     Leucine
+GTT    V       Val     Valine
+GTC    V       Val     Valine
+GTA    V       Val     Valine
+GTG    V       Val     Valine
+TTT    F       Phe     Phenylalanine
+TTC    F       Phe     Phenylalanine
+ATG    M       Met     Methionine
+TGT    C       Cys     Cysteine
+TGC    C       Cys     Cysteine
+GCT    A       Ala     Alanine
+GCC    A       Ala     Alanine
+GCA    A       Ala     Alanine
+GCG    A       Ala     Alanine
+GGT    G       Gly     Glycine
+GGC    G       Gly     Glycine
+GGA    G       Gly     Glycine
+GGG    G       Gly     Glycine
+CCT    P       Pro     Proline
+CCC    P       Pro     Proline
+CCA    P       Pro     Proline
+CCG    P       Pro     Proline
+ACT    T       Thr     Threonine
+ACC    T       Thr     Threonine
+ACA    T       Thr     Threonine
+ACG    T       Thr     Threonine
+TCT    S       Ser     Serine
+TCC    S       Ser     Serine
+TCA    S       Ser     Serine
+TCG    S       Ser     Serine
+AGT    S       Ser     Serine
+AGC    S       Ser     Serine
+TAT    Y       Tyr     Tyrosine
+TAC    Y       Tyr     Tyrosine
+TGG    W       Trp     Tryptophan
+CAA    Q       Gln     Glutamine
+CAG    Q       Gln     Glutamine
+AAT    N       Asn     Asparagine
+AAC    N       Asn     Asparagine
+CAT    H       His     Histidine
+CAC    H       His     Histidine
+GAA    E       Glu     Glutamic acid
+GAG    E       Glu     Glutamic acid
+GAT    D       Asp     Aspartic acid
+GAC    D       Asp     Aspartic acid
+AAA    K       Lys     Lysine
+AAG    K       Lys     Lysine
+CGT    R       Arg     Arginine
+CGC    R       Arg     Arginine
+CGA    R       Arg     Arginine
+CGG    R       Arg     Arginine
+AGA    R       Arg     Arginine
+AGG    R       Arg     Arginine
+TAA    *       *       Stop codon
+TAG    *       *       Stop codon
+TGA    *       *       Stop codon
diff --git a/utils/intron_exon_loader.pl b/utils/intron_exon_loader.pl
new file mode 100755 (executable)
index 0000000..99ace9d
--- /dev/null
@@ -0,0 +1,333 @@
+#!/usr/bin/perl
+# intron_exon_loader makes loadable mrna and cds files, and is
+# released under the terms of the GPL version 2, or any later version,
+# at your option. See the file README and COPYING for more
+# information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+intron_exon_loader - Make SQL loadable mrna and cds files
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader| \
+ psql snp -c "COPY mrna FROM STDIN WITH DELIMITER '   ' NULL AS 'NULL'";
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader --output cds| \
+ psql snp -c "COPY cds FROM STDIN WITH DELIMITER '   ' NULL AS 'NULL'";
+
+
+=cut
+
+use vars qw($DEBUG);
+
+my %options = (type            => 'mrna',
+              debug           => 0,
+              help            => 0,
+              man             => 0,
+              );
+
+GetOptions(\%options,
+          'type|t=s',
+          'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+if ($options{type} !~ /^(mrna|cds)$/) {
+    push @USAGE_ERRORS, "--type must be one of mrna or cds, not '$options{type}'";
+}
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+use DBI;
+use Data::Dumper;
+
+my $dbh = DBI->connect("dbi:Pg:dbname=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database".$DBI::errstr;
+
+my $sth = $dbh->prepare(<<'END') // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT * FROM contigs where gi=?;
+END
+
+sub main {
+    my ($options) = @_;
+    my %current_contig;
+    my $in_stanza = 0;
+    my $skip_until_next_contig;
+    my $current_stanza = '';
+    my $type = lc($options{type});
+    my $stanza_type = $type eq 'mrna' ? 'mRNA' : 'CDS';
+    while (<>) {
+       chomp;
+       if (/^VERSION/) {
+           $skip_until_next_contig = 0;
+           my ($acc,$ver,$gi) = $_ =~ m/^VERSION\s+([A-Z0-9_]+)\.(\d+)\s+GI\:(\d+)/;
+           if (not defined $gi) {
+               die "No GI in '$_' on line $.";
+           }
+           %current_contig = (acc => $acc,
+                              ver => $ver,
+                              gi  => $gi,
+                             );
+           %current_contig = contig_information($dbh,$sth,%current_contig);
+           if (not defined $current_contig{chr_start}) {
+               $skip_until_next_contig = 1;
+           }
+       }
+       next if $skip_until_next_contig;
+       if (/^\s{5}(\S+)/) {
+           if ($1 eq $stanza_type) {
+               if (length $current_stanza and $in_stanza) {
+                   output_current_stanza($type,$current_stanza,\%current_contig);
+                   $current_stanza = '';
+                   $in_stanza = 0;
+               }
+               $in_stanza = 1;
+               s/^\s{5}\S+//;
+               s/^\s+//;
+               $current_stanza = $_."\n";
+           } elsif (length $current_stanza) {
+               output_current_stanza($type,$current_stanza,\%current_contig);
+               $current_stanza = '';
+               $in_stanza = 0;
+           } else {
+               $in_stanza = 0;
+           }
+       }
+       elsif (/^[A-Z]/ and length $current_stanza) {
+           output_current_stanza($type,$current_stanza,\%current_contig);
+           $current_stanza = '';
+           $in_stanza = 0;
+       } else {
+           if ($in_stanza) {
+               $current_stanza .= $_."\n";
+           }
+       }
+    }
+}
+
+main();
+
+sub contig_information{
+    my ($dbh,$sth,%current_contig) = @_;
+
+    my $rv = $sth->execute($current_contig{gi}) or
+       die "Unable to execute statement properly: ".$dbh->errstr;
+
+    my $results = $sth->fetchall_hashref('gi');
+    if ($sth->err) {
+       print STDERR $sth->errstr;
+    }
+    if (exists $results->{$current_contig{gi}}) {
+       %current_contig = (%current_contig,
+                          %{$results->{$current_contig{gi}}},
+                         );
+    }
+    return %current_contig;
+}
+
+sub output_current_stanza{
+    my ($type,$current_stanza,$current_contig) = @_;
+    $current_stanza =~ s/^[\s\t]+//mg;
+    $current_stanza =~ s/\n\s*([^\/])/ $1/mg;
+    my ($extent,@other_fields) = split /\n/,$current_stanza;
+    $extent =~ s/\s+//g;
+    #      print "extent: $extent\n";
+    # figure out extent
+    my $complement = $extent =~ m/complement/;
+    $extent =~ s/complement\((.+)\)$/$1/;
+    # strip out the join, because we don't care about it
+    $extent =~ s/join\((.+)\)$/$1/;
+    # handle fields
+    my %fields = map {my ($key,$value) = $_ =~ m{\/([^=]+)="?(.+?)"?$}s;
+                     (not defined $key)?():(($key eq 'db_xref' && $value =~ m/([^:]+):(.+)/)?(lc($1),$2):(lc($key),$value))} @other_fields;
+    return if not exists $fields{product};
+    if (exists $fields{transcript_id}) {
+       ($fields{accession},$fields{accession_ver}) = $fields{transcript_id} =~ m/(.+)\.(\d+)/;
+    }
+    elsif (exists $fields{protein_id}) {
+       ($fields{accession},$fields{accession_ver}) = $fields{protein_id} =~ m/(.+)\.(\d+)/;
+    }
+    else {
+       return;
+    }
+    my ($variant) = $fields{product} =~ m/variant\s*(\d+)/i;
+    if (not defined $variant or not length $variant) {
+       ($variant) = $fields{product} =~ m/isoform\s*([^\s]+)/;
+    }
+    if (not defined $variant or not length $variant) { # or $variant !~ /^\d+$/) {
+       $variant = 'NULL';
+    }
+    my @extents = split /\s*\,\s*/,$extent;
+    if ($type eq 'mrna') {
+       output_current_mrna(\@extents,$variant,$complement,\%fields,$current_contig);
+    }
+    else {
+       output_current_cds(\@extents,$variant,$complement,\%fields,$current_contig);
+    }
+}
+
+sub output_current_mrna{
+    my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+           # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+#          complement(join(126698..126805,126953..129696,
+#                      129790..129847,130075..130566))
+#                      /gene="LOC729737"
+#                      /product="hypothetical protein LOC729737"
+#                      /note="Derived by automated computational analysis using
+#                      gene prediction method: GNOMON. Supporting evidence
+#                      includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+#                      /transcript_id="AM_001133863.3"
+#                      /db_xref="GI:310118477"
+    #                      /db_xref="Aeneid:729737"
+    my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+                        $variant,
+                        $complement ? 'TRUE':'FALSE',
+                        @{$fields}{qw(gene geneid)},
+                        @{$current_contig}{qw(gi chr)},
+                       );
+    my $previous_stop = undef;
+    my $previous_start = undef;
+    my $number = 1;
+    my @exons = @{$extents};
+    if ($complement) {
+       @exons = reverse @exons;
+    }
+    for my $exon (@exons) {
+       # strip off non-numerics like '>' and '<' here,
+       # because we don't want to deal with them
+       my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+       if (not defined $stop) {
+           $stop = $start;
+       }
+       if (defined $previous_stop and $previous_stop > 0 and
+           defined $previous_start and $previous_start > 0
+          ) {
+           my $intron_start = $previous_stop+1;
+           my $intron_stop = $start-1;
+           if ($complement) {
+               $intron_start = $stop+1;
+               $intron_stop = $previous_start-1;
+           }
+           print join("\t",
+                      @output_fields,
+                      'FALSE',
+                      $number-1,
+                      $current_contig->{chr_start}+$intron_start-1,
+                      $current_contig->{chr_start}+$intron_stop-1,
+                      $intron_start,
+                      $intron_stop,
+                     )."\n";
+       }
+       print join("\t",
+                  @output_fields,
+                  'TRUE',
+                  $number,
+                  $current_contig->{chr_start}+$start-1,
+                  $current_contig->{chr_start}+$stop-1,
+                  $start,
+                  $stop,
+                 )."\n";
+       $number++;
+       $previous_stop = $stop;
+       $previous_start = $start;
+    }
+}
+
+
+sub output_current_cds {
+    my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+           # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+#          complement(join(126698..126805,126953..129696,
+#                      129790..129847,130075..130566))
+#                      /gene="LOC729737"
+#                      /product="hypothetical protein LOC729737"
+#                      /note="Derived by automated computational analysis using
+#                      gene prediction method: GNOMON. Supporting evidence
+#                      includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+#                      /transcript_id="AM_001133863.3"
+#                      /db_xref="GI:310118477"
+    #                      /db_xref="Aeneid:729737"
+    if (not exists $fields->{protein_id}) {
+       return;
+    }
+    my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+                        $variant,
+                        $complement ? 'TRUE':'FALSE',
+                        @{$fields}{qw(gene geneid)},
+                        @{$current_contig}{qw(gi chr)},
+                       );
+    my $previous_stop = undef;
+    my $previous_start = undef;
+    my $number = 1;
+    my @exons = @{$extents};
+    if ($complement) {
+       @exons = reverse @exons;
+    }
+    my $position = 0;
+    my $current_phase = 0;
+    for my $exon (@exons) {
+       # strip off non-numerics like '>' and '<' here,
+       # because we don't want to deal with them
+       my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+       if (not defined $stop) {
+           $stop = $start;
+       }
+       print join("\t",
+                  @output_fields,
+                  $number,
+                  $current_phase,
+                  $current_contig->{chr_start}+$start-1,
+                  $current_contig->{chr_start}+$stop-1,
+                  $start,
+                  $stop,
+                  $position,
+                 )."\n";
+       $current_phase = ($current_phase + $stop - $start + 1) % 3;
+       $number++;
+       $previous_stop = $stop;
+       $previous_start = $start;
+       $position += abs($start-$stop)+1;
+    }
+}