From: Don Armstrong <don@donarmstrong.com>
Date: Tue, 13 Dec 2011 22:31:41 +0000 (-0800)
Subject: add intron/exon schema and mrna cds loader
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=95ae8613ee3ff948d36ce633404102c3ad90881f;p=dbsnp.git

add intron/exon schema and mrna cds loader
---

diff --git a/schema/extra_schema/intron_exon_schema.sql b/schema/extra_schema/intron_exon_schema.sql
new file mode 100644
index 0000000..e1aed7f
--- /dev/null
+++ b/schema/extra_schema/intron_exon_schema.sql
@@ -0,0 +1,56 @@
+CREATE TABLE contigs (
+       tax_id int NOT NULL,
+       chr VARCHAR(127) NOT NULL,
+       chr_start INT NOT NULL,
+       chr_stop INT NOT NULL,
+       orientation VARCHAR(1) NOT NULL DEFAULT '+',
+       feature_name VARCHAR(127) NOT NULL,
+       gi INT,
+       feature_type VARCHAR(127) NOT NULL
+);
+
+CREATE TABLE exons (
+       gi int NOT NULL,
+       accession VARCHAR(127) NOT NULL,
+       accession_ver INT NOT NULL,
+       number INT NOT NULL,
+       start INT NOT NULL,
+       stop INT NOT NULL	
+       );
+
+CREATE TABLE cds (
+       gi int NOT NULL,
+       accession VARCHAR(127) NOT NULL,
+       accession_ver INT NOT NULL,
+       variant VARCHAR(127),
+       complement BOOLEAN DEFAULT FALSE,
+       gene VARCHAR (127),
+       geneid INT NOT NULL,
+       contig_gi INT NOT NULL,
+       chr VARCHAR (127),
+       number INT NOT NULL,
+       phase INT NOT NULL,
+       start INT NOT NULL,
+       stop INT NOT NULL,
+       contig_start INT NOT NULL,	
+       contig_stop INT NOT NULL,
+       position INT NOT NULL);
+
+CREATE TABLE mrna (
+       gi int NOT NULL,
+       accession VARCHAR(127) NOT NULL,
+       accession_ver INT NOT NULL,
+       variant INT,
+       complement BOOLEAN DEFAULT FALSE,
+       gene VARCHAR (127),
+       geneid INT NOT NULL,
+       contig_gi INT NOT NULL,
+       chr VARCHAR (127),
+       exon BOOLEAN DEFAULT TRUE,
+       number INT NOT NULL,
+       start INT NOT NULL,
+       stop INT NOT NULL,
+       contig_start INT NOT NULL,	
+       contig_stop INT NOT NULL);
+
+
diff --git a/schema/extra_schema/intron_exon_schema_indexes.sql b/schema/extra_schema/intron_exon_schema_indexes.sql
new file mode 100644
index 0000000..dd2ba09
--- /dev/null
+++ b/schema/extra_schema/intron_exon_schema_indexes.sql
@@ -0,0 +1,30 @@
+CREATE UNIQUE INDEX ON contigs(gi);
+CREATE INDEX ON contigs(feature_name);
+CREATE INDEX ON contigs(chr);
+
+CREATE UNIQUE INDEX ON exons(gi,number);
+CREATE UNIQUE INDEX ON exons(accession,accession_ver,number);
+CREATE INDEX ON exons(accession);
+
+CREATE UNIQUE INDEX ON cds(gi,number);
+CREATE UNIQUE INDEX ON cds(accession,accession_ver,number);
+CREATE INDEX ON cds(accession);
+
+CREATE INDEX ON cds(accession);
+CREATE INDEX ON cds(gene);
+CREATE INDEX ON cds(start);
+CREATE INDEX ON cds(stop);
+CREATE INDEX ON cds(gi);
+CREATE INDEX ON cds(start,stop);
+CREATE INDEX ON cds(chr);
+
+-- CREATE UNIQUE INDEX ON mrna(accession,ctg_gi,number);
+-- CREATE INDEX ON mrna(gi,number);
+-- CREATE INDEX ON mrna(accession,accession_ver,number);
+CREATE INDEX ON mrna(accession);
+CREATE INDEX ON mrna(gene);
+CREATE INDEX ON mrna(start);
+CREATE INDEX ON mrna(stop);
+CREATE INDEX ON mrna(gi);
+CREATE INDEX ON mrna(start,stop);
+CREATE INDEX ON mrna(chr);
\ No newline at end of file
diff --git a/utils/human_mrna_cds_insert.pl b/utils/human_mrna_cds_insert.pl
new file mode 100755
index 0000000..ad15784
--- /dev/null
+++ b/utils/human_mrna_cds_insert.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# psql snp -c "COPY mrna_cds_table FROM STDIN WITH DELIMITER '   ' NULL AS 'NULL'";
+
+my $prot_table;
+
+while (<DATA>) {
+    chomp;
+    my ($codon,$aa,$aa_med,$aa_long) = split /\t/;
+    $prot_table->{$codon} = $aa;
+}
+
+my $current_stanza;
+while (<>) {
+    $current_stanza .= $_;
+    if ($_ =~ m{^//$}) {
+	handle_mrna_stanza($current_stanza);
+	$current_stanza = '';
+    }
+}
+
+sub handle_mrna_stanza{
+    my ($current_stanza) = @_;
+
+    my ($accession,$version,$gi) =
+	$current_stanza =~ m/^VERSION\s+([^\.]+)\.(\d+)\s+GI:(\d+)$/m;
+    my ($gene) =
+	$current_stanza =~ m/^\s+\/gene=\"?([^"]+)\"?$/m;
+    my ($cds) =
+	$current_stanza =~ m/^\s+CDS\s+(\d+(?:\.\.\d+))$/m;
+    return if not defined $cds;
+    my ($start,$stop) = split /\.\./,$cds;
+    return if not defined $start;
+    if (not defined $stop) {
+	$stop = $start;
+    }
+    my ($sequence) =
+	$current_stanza =~ m/^(ORIGIN.+)$/sm;
+    $sequence =~ s{\s*//\s*$}{};
+    $sequence =~ s{^ORIGIN}{};
+    $sequence =~ s{[\d\s\n]+}{}g;
+    $sequence = uc($sequence);
+    my $sequence2 = substr $sequence,($start-1),($stop-$start+1);
+    # There are other posibilities, like CTG, ACG, etc; so we can't test this.
+    # if ($sequence2 !~ /^A[UT]G/i) {
+    # 	print STDERR $sequence2."\n";
+    # 	print STDERR $gene."\n";
+    # 	die "Sequence doesn't start with a start codon";
+    # }
+    if ($sequence2 !~ /(?:[TU]AA|[TU]AG|[TU]GA)$/) {
+	print STDERR $sequence2."\n";
+	print STDERR $gene."\n";
+	die "Sequence doesn't end with a stop codon";
+    }
+    # figure out cds gi
+    my $cds_gi='NULL';
+    my ($cds_stanza) = $current_stanza =~ /^\s{5}CDS\s+(.+?)^(?:\s{5}\w+|ORIGIN)/ms;
+    if (length $cds_stanza) {
+	($cds_gi) = $cds_stanza =~ /db_xref="GI:(\d+)"/;
+	if (not defined $cds_gi) {
+	    $cds_gi = 'NULL';
+	}
+    }
+    print join("\t",$gi,$cds_gi,$accession,$version,$gene,$start,$stop,length($sequence2), uc($sequence2),convert_to_prot_seq($sequence2))."\n";
+}
+
+sub convert_to_prot_seq{
+    my ($mrna) = @_;
+$mrna =~ s/U/T/g;
+    my $prot = '';
+    for my $pos (0..(length($mrna) / 3 - 1)) {
+	my $codon = substr($mrna,$pos*3,3);
+	if (not exists $prot_table->{$codon}) {
+	   $prot .= 'N';
+	}
+	else {
+	    $prot .= $prot_table->{$codon};
+	}
+    }
+    return $prot;
+}
+
+__DATA__
+ATT	I	Ile	Isoleucine
+ATC	I	Ile	Isoleucine
+ATA	I	Ile	Isoleucine
+CTT	L	Leu	Leucine
+CTC	L	Leu	Leucine
+CTA	L	Leu	Leucine
+CTG	L	Leu	Leucine
+TTA	L	Leu	Leucine
+TTG	L	Leu	Leucine
+GTT	V	Val	Valine
+GTC	V	Val	Valine
+GTA	V	Val	Valine
+GTG	V	Val	Valine
+TTT	F	Phe	Phenylalanine
+TTC	F	Phe	Phenylalanine
+ATG	M	Met	Methionine
+TGT	C	Cys	Cysteine
+TGC	C	Cys	Cysteine
+GCT	A	Ala	Alanine
+GCC	A	Ala	Alanine
+GCA	A	Ala	Alanine
+GCG	A	Ala	Alanine
+GGT	G	Gly	Glycine
+GGC	G	Gly	Glycine
+GGA	G	Gly	Glycine
+GGG	G	Gly	Glycine
+CCT	P	Pro	Proline
+CCC	P	Pro	Proline
+CCA	P	Pro	Proline
+CCG	P	Pro	Proline
+ACT	T	Thr	Threonine
+ACC	T	Thr	Threonine
+ACA	T	Thr	Threonine
+ACG	T	Thr	Threonine
+TCT	S	Ser	Serine
+TCC	S	Ser	Serine
+TCA	S	Ser	Serine
+TCG	S	Ser	Serine
+AGT	S	Ser	Serine
+AGC	S	Ser	Serine
+TAT	Y	Tyr	Tyrosine
+TAC	Y	Tyr	Tyrosine
+TGG	W	Trp	Tryptophan
+CAA	Q	Gln	Glutamine
+CAG	Q	Gln	Glutamine
+AAT	N	Asn	Asparagine
+AAC	N	Asn	Asparagine
+CAT	H	His	Histidine
+CAC	H	His	Histidine
+GAA	E	Glu	Glutamic acid
+GAG	E	Glu	Glutamic acid
+GAT	D	Asp	Aspartic acid
+GAC	D	Asp	Aspartic acid
+AAA	K	Lys	Lysine
+AAG	K	Lys	Lysine
+CGT	R	Arg	Arginine
+CGC	R	Arg	Arginine
+CGA	R	Arg	Arginine
+CGG	R	Arg	Arginine
+AGA	R	Arg	Arginine
+AGG	R	Arg	Arginine
+TAA	*	*	Stop codon
+TAG	*	*	Stop codon
+TGA	*	*	Stop codon
diff --git a/utils/intron_exon_loader.pl b/utils/intron_exon_loader.pl
new file mode 100755
index 0000000..99ace9d
--- /dev/null
+++ b/utils/intron_exon_loader.pl
@@ -0,0 +1,333 @@
+#!/usr/bin/perl
+# intron_exon_loader makes loadable mrna and cds files, and is
+# released under the terms of the GPL version 2, or any later version,
+# at your option. See the file README and COPYING for more
+# information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+intron_exon_loader - Make SQL loadable mrna and cds files
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader| \
+ psql snp -c "COPY mrna FROM STDIN WITH DELIMITER '   ' NULL AS 'NULL'";
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader --output cds| \
+ psql snp -c "COPY cds FROM STDIN WITH DELIMITER '   ' NULL AS 'NULL'";
+
+
+=cut
+
+use vars qw($DEBUG);
+
+my %options = (type            => 'mrna',
+	       debug           => 0,
+	       help            => 0,
+	       man             => 0,
+	       );
+
+GetOptions(\%options,
+	   'type|t=s',
+	   'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+if ($options{type} !~ /^(mrna|cds)$/) {
+    push @USAGE_ERRORS, "--type must be one of mrna or cds, not '$options{type}'";
+}
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+use DBI;
+use Data::Dumper;
+
+my $dbh = DBI->connect("dbi:Pg:dbname=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database".$DBI::errstr;
+
+my $sth = $dbh->prepare(<<'END') // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT * FROM contigs where gi=?;
+END
+
+sub main {
+    my ($options) = @_;
+    my %current_contig;
+    my $in_stanza = 0;
+    my $skip_until_next_contig;
+    my $current_stanza = '';
+    my $type = lc($options{type});
+    my $stanza_type = $type eq 'mrna' ? 'mRNA' : 'CDS';
+    while (<>) {
+	chomp;
+	if (/^VERSION/) {
+	    $skip_until_next_contig = 0;
+	    my ($acc,$ver,$gi) = $_ =~ m/^VERSION\s+([A-Z0-9_]+)\.(\d+)\s+GI\:(\d+)/;
+	    if (not defined $gi) {
+		die "No GI in '$_' on line $.";
+	    }
+	    %current_contig = (acc => $acc,
+			       ver => $ver,
+			       gi  => $gi,
+			      );
+	    %current_contig = contig_information($dbh,$sth,%current_contig);
+	    if (not defined $current_contig{chr_start}) {
+		$skip_until_next_contig = 1;
+	    }
+	}
+	next if $skip_until_next_contig;
+	if (/^\s{5}(\S+)/) {
+	    if ($1 eq $stanza_type) {
+		if (length $current_stanza and $in_stanza) {
+		    output_current_stanza($type,$current_stanza,\%current_contig);
+		    $current_stanza = '';
+		    $in_stanza = 0;
+		}
+		$in_stanza = 1;
+		s/^\s{5}\S+//;
+		s/^\s+//;
+		$current_stanza = $_."\n";
+	    } elsif (length $current_stanza) {
+		output_current_stanza($type,$current_stanza,\%current_contig);
+		$current_stanza = '';
+		$in_stanza = 0;
+	    } else {
+		$in_stanza = 0;
+	    }
+	}
+	elsif (/^[A-Z]/ and length $current_stanza) {
+	    output_current_stanza($type,$current_stanza,\%current_contig);
+	    $current_stanza = '';
+	    $in_stanza = 0;
+	} else {
+	    if ($in_stanza) {
+		$current_stanza .= $_."\n";
+	    }
+	}
+    }
+}
+
+main();
+
+sub contig_information{
+    my ($dbh,$sth,%current_contig) = @_;
+
+    my $rv = $sth->execute($current_contig{gi}) or
+	die "Unable to execute statement properly: ".$dbh->errstr;
+
+    my $results = $sth->fetchall_hashref('gi');
+    if ($sth->err) {
+	print STDERR $sth->errstr;
+    }
+    if (exists $results->{$current_contig{gi}}) {
+	%current_contig = (%current_contig,
+			   %{$results->{$current_contig{gi}}},
+			  );
+    }
+    return %current_contig;
+}
+
+sub output_current_stanza{
+    my ($type,$current_stanza,$current_contig) = @_;
+    $current_stanza =~ s/^[\s\t]+//mg;
+    $current_stanza =~ s/\n\s*([^\/])/ $1/mg;
+    my ($extent,@other_fields) = split /\n/,$current_stanza;
+    $extent =~ s/\s+//g;
+    #	    print "extent: $extent\n";
+    # figure out extent
+    my $complement = $extent =~ m/complement/;
+    $extent =~ s/complement\((.+)\)$/$1/;
+    # strip out the join, because we don't care about it
+    $extent =~ s/join\((.+)\)$/$1/;
+    # handle fields
+    my %fields = map {my ($key,$value) = $_ =~ m{\/([^=]+)="?(.+?)"?$}s;
+		      (not defined $key)?():(($key eq 'db_xref' && $value =~ m/([^:]+):(.+)/)?(lc($1),$2):(lc($key),$value))} @other_fields;
+    return if not exists $fields{product};
+    if (exists $fields{transcript_id}) {
+	($fields{accession},$fields{accession_ver}) = $fields{transcript_id} =~ m/(.+)\.(\d+)/;
+    }
+    elsif (exists $fields{protein_id}) {
+	($fields{accession},$fields{accession_ver}) = $fields{protein_id} =~ m/(.+)\.(\d+)/;
+    }
+    else {
+	return;
+    }
+    my ($variant) = $fields{product} =~ m/variant\s*(\d+)/i;
+    if (not defined $variant or not length $variant) {
+	($variant) = $fields{product} =~ m/isoform\s*([^\s]+)/;
+    }
+    if (not defined $variant or not length $variant) { # or $variant !~ /^\d+$/) {
+	$variant = 'NULL';
+    }
+    my @extents = split /\s*\,\s*/,$extent;
+    if ($type eq 'mrna') {
+	output_current_mrna(\@extents,$variant,$complement,\%fields,$current_contig);
+    }
+    else {
+	output_current_cds(\@extents,$variant,$complement,\%fields,$current_contig);
+    }
+}
+
+sub output_current_mrna{
+    my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+	    # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+# 	    complement(join(126698..126805,126953..129696,
+#                      129790..129847,130075..130566))
+#                      /gene="LOC729737"
+#                      /product="hypothetical protein LOC729737"
+#                      /note="Derived by automated computational analysis using
+#                      gene prediction method: GNOMON. Supporting evidence
+#                      includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+#                      /transcript_id="AM_001133863.3"
+#                      /db_xref="GI:310118477"
+    #                      /db_xref="Aeneid:729737"
+    my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+			 $variant,
+			 $complement ? 'TRUE':'FALSE',
+			 @{$fields}{qw(gene geneid)},
+			 @{$current_contig}{qw(gi chr)},
+			);
+    my $previous_stop = undef;
+    my $previous_start = undef;
+    my $number = 1;
+    my @exons = @{$extents};
+    if ($complement) {
+	@exons = reverse @exons;
+    }
+    for my $exon (@exons) {
+	# strip off non-numerics like '>' and '<' here,
+	# because we don't want to deal with them
+	my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+	if (not defined $stop) {
+	    $stop = $start;
+	}
+	if (defined $previous_stop and $previous_stop > 0 and
+	    defined $previous_start and $previous_start > 0
+	   ) {
+	    my $intron_start = $previous_stop+1;
+	    my $intron_stop = $start-1;
+	    if ($complement) {
+		$intron_start = $stop+1;
+		$intron_stop = $previous_start-1;
+	    }
+	    print join("\t",
+		       @output_fields,
+		       'FALSE',
+		       $number-1,
+		       $current_contig->{chr_start}+$intron_start-1,
+		       $current_contig->{chr_start}+$intron_stop-1,
+		       $intron_start,
+		       $intron_stop,
+		      )."\n";
+	}
+	print join("\t",
+		   @output_fields,
+		   'TRUE',
+		   $number,
+		   $current_contig->{chr_start}+$start-1,
+		   $current_contig->{chr_start}+$stop-1,
+		   $start,
+		   $stop,
+		  )."\n";
+	$number++;
+	$previous_stop = $stop;
+	$previous_start = $start;
+    }
+}
+
+
+sub output_current_cds {
+    my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+	    # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+# 	    complement(join(126698..126805,126953..129696,
+#                      129790..129847,130075..130566))
+#                      /gene="LOC729737"
+#                      /product="hypothetical protein LOC729737"
+#                      /note="Derived by automated computational analysis using
+#                      gene prediction method: GNOMON. Supporting evidence
+#                      includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+#                      /transcript_id="AM_001133863.3"
+#                      /db_xref="GI:310118477"
+    #                      /db_xref="Aeneid:729737"
+    if (not exists $fields->{protein_id}) {
+	return;
+    }
+    my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+			 $variant,
+			 $complement ? 'TRUE':'FALSE',
+			 @{$fields}{qw(gene geneid)},
+			 @{$current_contig}{qw(gi chr)},
+			);
+    my $previous_stop = undef;
+    my $previous_start = undef;
+    my $number = 1;
+    my @exons = @{$extents};
+    if ($complement) {
+	@exons = reverse @exons;
+    }
+    my $position = 0;
+    my $current_phase = 0;
+    for my $exon (@exons) {
+	# strip off non-numerics like '>' and '<' here,
+	# because we don't want to deal with them
+	my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+	if (not defined $stop) {
+	    $stop = $start;
+	}
+	print join("\t",
+		   @output_fields,
+		   $number,
+		   $current_phase,
+		   $current_contig->{chr_start}+$start-1,
+		   $current_contig->{chr_start}+$stop-1,
+		   $start,
+		   $stop,
+		   $position,
+		  )."\n";
+	$current_phase = ($current_phase + $stop - $start + 1) % 3;
+	$number++;
+	$previous_stop = $stop;
+	$previous_start = $start;
+	$position += abs($start-$stop)+1;
+    }
+}