From: Don Armstrong Date: Tue, 13 Dec 2011 22:31:41 +0000 (-0800) Subject: add intron/exon schema and mrna cds loader X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=95ae8613ee3ff948d36ce633404102c3ad90881f;p=dbsnp.git add intron/exon schema and mrna cds loader --- diff --git a/schema/extra_schema/intron_exon_schema.sql b/schema/extra_schema/intron_exon_schema.sql new file mode 100644 index 0000000..e1aed7f --- /dev/null +++ b/schema/extra_schema/intron_exon_schema.sql @@ -0,0 +1,56 @@ +CREATE TABLE contigs ( + tax_id int NOT NULL, + chr VARCHAR(127) NOT NULL, + chr_start INT NOT NULL, + chr_stop INT NOT NULL, + orientation VARCHAR(1) NOT NULL DEFAULT '+', + feature_name VARCHAR(127) NOT NULL, + gi INT, + feature_type VARCHAR(127) NOT NULL +); + +CREATE TABLE exons ( + gi int NOT NULL, + accession VARCHAR(127) NOT NULL, + accession_ver INT NOT NULL, + number INT NOT NULL, + start INT NOT NULL, + stop INT NOT NULL + ); + +CREATE TABLE cds ( + gi int NOT NULL, + accession VARCHAR(127) NOT NULL, + accession_ver INT NOT NULL, + variant VARCHAR(127), + complement BOOLEAN DEFAULT FALSE, + gene VARCHAR (127), + geneid INT NOT NULL, + contig_gi INT NOT NULL, + chr VARCHAR (127), + number INT NOT NULL, + phase INT NOT NULL, + start INT NOT NULL, + stop INT NOT NULL, + contig_start INT NOT NULL, + contig_stop INT NOT NULL, + position INT NOT NULL); + +CREATE TABLE mrna ( + gi int NOT NULL, + accession VARCHAR(127) NOT NULL, + accession_ver INT NOT NULL, + variant INT, + complement BOOLEAN DEFAULT FALSE, + gene VARCHAR (127), + geneid INT NOT NULL, + contig_gi INT NOT NULL, + chr VARCHAR (127), + exon BOOLEAN DEFAULT TRUE, + number INT NOT NULL, + start INT NOT NULL, + stop INT NOT NULL, + contig_start INT NOT NULL, + contig_stop INT NOT NULL); + + diff --git a/schema/extra_schema/intron_exon_schema_indexes.sql b/schema/extra_schema/intron_exon_schema_indexes.sql new file mode 100644 index 0000000..dd2ba09 --- /dev/null +++ b/schema/extra_schema/intron_exon_schema_indexes.sql @@ -0,0 +1,30 @@ +CREATE UNIQUE INDEX ON contigs(gi); +CREATE INDEX ON contigs(feature_name); +CREATE INDEX ON contigs(chr); + +CREATE UNIQUE INDEX ON exons(gi,number); +CREATE UNIQUE INDEX ON exons(accession,accession_ver,number); +CREATE INDEX ON exons(accession); + +CREATE UNIQUE INDEX ON cds(gi,number); +CREATE UNIQUE INDEX ON cds(accession,accession_ver,number); +CREATE INDEX ON cds(accession); + +CREATE INDEX ON cds(accession); +CREATE INDEX ON cds(gene); +CREATE INDEX ON cds(start); +CREATE INDEX ON cds(stop); +CREATE INDEX ON cds(gi); +CREATE INDEX ON cds(start,stop); +CREATE INDEX ON cds(chr); + +-- CREATE UNIQUE INDEX ON mrna(accession,ctg_gi,number); +-- CREATE INDEX ON mrna(gi,number); +-- CREATE INDEX ON mrna(accession,accession_ver,number); +CREATE INDEX ON mrna(accession); +CREATE INDEX ON mrna(gene); +CREATE INDEX ON mrna(start); +CREATE INDEX ON mrna(stop); +CREATE INDEX ON mrna(gi); +CREATE INDEX ON mrna(start,stop); +CREATE INDEX ON mrna(chr); \ No newline at end of file diff --git a/utils/human_mrna_cds_insert.pl b/utils/human_mrna_cds_insert.pl new file mode 100755 index 0000000..ad15784 --- /dev/null +++ b/utils/human_mrna_cds_insert.pl @@ -0,0 +1,150 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +# psql snp -c "COPY mrna_cds_table FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'"; + +my $prot_table; + +while () { + chomp; + my ($codon,$aa,$aa_med,$aa_long) = split /\t/; + $prot_table->{$codon} = $aa; +} + +my $current_stanza; +while (<>) { + $current_stanza .= $_; + if ($_ =~ m{^//$}) { + handle_mrna_stanza($current_stanza); + $current_stanza = ''; + } +} + +sub handle_mrna_stanza{ + my ($current_stanza) = @_; + + my ($accession,$version,$gi) = + $current_stanza =~ m/^VERSION\s+([^\.]+)\.(\d+)\s+GI:(\d+)$/m; + my ($gene) = + $current_stanza =~ m/^\s+\/gene=\"?([^"]+)\"?$/m; + my ($cds) = + $current_stanza =~ m/^\s+CDS\s+(\d+(?:\.\.\d+))$/m; + return if not defined $cds; + my ($start,$stop) = split /\.\./,$cds; + return if not defined $start; + if (not defined $stop) { + $stop = $start; + } + my ($sequence) = + $current_stanza =~ m/^(ORIGIN.+)$/sm; + $sequence =~ s{\s*//\s*$}{}; + $sequence =~ s{^ORIGIN}{}; + $sequence =~ s{[\d\s\n]+}{}g; + $sequence = uc($sequence); + my $sequence2 = substr $sequence,($start-1),($stop-$start+1); + # There are other posibilities, like CTG, ACG, etc; so we can't test this. + # if ($sequence2 !~ /^A[UT]G/i) { + # print STDERR $sequence2."\n"; + # print STDERR $gene."\n"; + # die "Sequence doesn't start with a start codon"; + # } + if ($sequence2 !~ /(?:[TU]AA|[TU]AG|[TU]GA)$/) { + print STDERR $sequence2."\n"; + print STDERR $gene."\n"; + die "Sequence doesn't end with a stop codon"; + } + # figure out cds gi + my $cds_gi='NULL'; + my ($cds_stanza) = $current_stanza =~ /^\s{5}CDS\s+(.+?)^(?:\s{5}\w+|ORIGIN)/ms; + if (length $cds_stanza) { + ($cds_gi) = $cds_stanza =~ /db_xref="GI:(\d+)"/; + if (not defined $cds_gi) { + $cds_gi = 'NULL'; + } + } + print join("\t",$gi,$cds_gi,$accession,$version,$gene,$start,$stop,length($sequence2), uc($sequence2),convert_to_prot_seq($sequence2))."\n"; +} + +sub convert_to_prot_seq{ + my ($mrna) = @_; +$mrna =~ s/U/T/g; + my $prot = ''; + for my $pos (0..(length($mrna) / 3 - 1)) { + my $codon = substr($mrna,$pos*3,3); + if (not exists $prot_table->{$codon}) { + $prot .= 'N'; + } + else { + $prot .= $prot_table->{$codon}; + } + } + return $prot; +} + +__DATA__ +ATT I Ile Isoleucine +ATC I Ile Isoleucine +ATA I Ile Isoleucine +CTT L Leu Leucine +CTC L Leu Leucine +CTA L Leu Leucine +CTG L Leu Leucine +TTA L Leu Leucine +TTG L Leu Leucine +GTT V Val Valine +GTC V Val Valine +GTA V Val Valine +GTG V Val Valine +TTT F Phe Phenylalanine +TTC F Phe Phenylalanine +ATG M Met Methionine +TGT C Cys Cysteine +TGC C Cys Cysteine +GCT A Ala Alanine +GCC A Ala Alanine +GCA A Ala Alanine +GCG A Ala Alanine +GGT G Gly Glycine +GGC G Gly Glycine +GGA G Gly Glycine +GGG G Gly Glycine +CCT P Pro Proline +CCC P Pro Proline +CCA P Pro Proline +CCG P Pro Proline +ACT T Thr Threonine +ACC T Thr Threonine +ACA T Thr Threonine +ACG T Thr Threonine +TCT S Ser Serine +TCC S Ser Serine +TCA S Ser Serine +TCG S Ser Serine +AGT S Ser Serine +AGC S Ser Serine +TAT Y Tyr Tyrosine +TAC Y Tyr Tyrosine +TGG W Trp Tryptophan +CAA Q Gln Glutamine +CAG Q Gln Glutamine +AAT N Asn Asparagine +AAC N Asn Asparagine +CAT H His Histidine +CAC H His Histidine +GAA E Glu Glutamic acid +GAG E Glu Glutamic acid +GAT D Asp Aspartic acid +GAC D Asp Aspartic acid +AAA K Lys Lysine +AAG K Lys Lysine +CGT R Arg Arginine +CGC R Arg Arginine +CGA R Arg Arginine +CGG R Arg Arginine +AGA R Arg Arginine +AGG R Arg Arginine +TAA * * Stop codon +TAG * * Stop codon +TGA * * Stop codon diff --git a/utils/intron_exon_loader.pl b/utils/intron_exon_loader.pl new file mode 100755 index 0000000..99ace9d --- /dev/null +++ b/utils/intron_exon_loader.pl @@ -0,0 +1,333 @@ +#!/usr/bin/perl +# intron_exon_loader makes loadable mrna and cds files, and is +# released under the terms of the GPL version 2, or any later version, +# at your option. See the file README and COPYING for more +# information. +# Copyright 2011 by Don Armstrong . + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +intron_exon_loader - Make SQL loadable mrna and cds files + +=head1 SYNOPSIS + + [options] + + Options: + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + + zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \ + intron_exon_loader| \ + psql snp -c "COPY mrna FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'"; + + zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \ + intron_exon_loader --output cds| \ + psql snp -c "COPY cds FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'"; + + +=cut + +use vars qw($DEBUG); + +my %options = (type => 'mrna', + debug => 0, + help => 0, + man => 0, + ); + +GetOptions(\%options, + 'type|t=s', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; +if ($options{type} !~ /^(mrna|cds)$/) { + push @USAGE_ERRORS, "--type must be one of mrna or cds, not '$options{type}'"; +} +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + +use DBI; +use Data::Dumper; + +my $dbh = DBI->connect("dbi:Pg:dbname=snp",'','',{AutoCommit => 0}) or + die "Unable to connect to database".$DBI::errstr; + +my $sth = $dbh->prepare(<<'END') // die "Unable to prepare snpinfo statement: ".$dbh->errstr; +SELECT * FROM contigs where gi=?; +END + +sub main { + my ($options) = @_; + my %current_contig; + my $in_stanza = 0; + my $skip_until_next_contig; + my $current_stanza = ''; + my $type = lc($options{type}); + my $stanza_type = $type eq 'mrna' ? 'mRNA' : 'CDS'; + while (<>) { + chomp; + if (/^VERSION/) { + $skip_until_next_contig = 0; + my ($acc,$ver,$gi) = $_ =~ m/^VERSION\s+([A-Z0-9_]+)\.(\d+)\s+GI\:(\d+)/; + if (not defined $gi) { + die "No GI in '$_' on line $."; + } + %current_contig = (acc => $acc, + ver => $ver, + gi => $gi, + ); + %current_contig = contig_information($dbh,$sth,%current_contig); + if (not defined $current_contig{chr_start}) { + $skip_until_next_contig = 1; + } + } + next if $skip_until_next_contig; + if (/^\s{5}(\S+)/) { + if ($1 eq $stanza_type) { + if (length $current_stanza and $in_stanza) { + output_current_stanza($type,$current_stanza,\%current_contig); + $current_stanza = ''; + $in_stanza = 0; + } + $in_stanza = 1; + s/^\s{5}\S+//; + s/^\s+//; + $current_stanza = $_."\n"; + } elsif (length $current_stanza) { + output_current_stanza($type,$current_stanza,\%current_contig); + $current_stanza = ''; + $in_stanza = 0; + } else { + $in_stanza = 0; + } + } + elsif (/^[A-Z]/ and length $current_stanza) { + output_current_stanza($type,$current_stanza,\%current_contig); + $current_stanza = ''; + $in_stanza = 0; + } else { + if ($in_stanza) { + $current_stanza .= $_."\n"; + } + } + } +} + +main(); + +sub contig_information{ + my ($dbh,$sth,%current_contig) = @_; + + my $rv = $sth->execute($current_contig{gi}) or + die "Unable to execute statement properly: ".$dbh->errstr; + + my $results = $sth->fetchall_hashref('gi'); + if ($sth->err) { + print STDERR $sth->errstr; + } + if (exists $results->{$current_contig{gi}}) { + %current_contig = (%current_contig, + %{$results->{$current_contig{gi}}}, + ); + } + return %current_contig; +} + +sub output_current_stanza{ + my ($type,$current_stanza,$current_contig) = @_; + $current_stanza =~ s/^[\s\t]+//mg; + $current_stanza =~ s/\n\s*([^\/])/ $1/mg; + my ($extent,@other_fields) = split /\n/,$current_stanza; + $extent =~ s/\s+//g; + # print "extent: $extent\n"; + # figure out extent + my $complement = $extent =~ m/complement/; + $extent =~ s/complement\((.+)\)$/$1/; + # strip out the join, because we don't care about it + $extent =~ s/join\((.+)\)$/$1/; + # handle fields + my %fields = map {my ($key,$value) = $_ =~ m{\/([^=]+)="?(.+?)"?$}s; + (not defined $key)?():(($key eq 'db_xref' && $value =~ m/([^:]+):(.+)/)?(lc($1),$2):(lc($key),$value))} @other_fields; + return if not exists $fields{product}; + if (exists $fields{transcript_id}) { + ($fields{accession},$fields{accession_ver}) = $fields{transcript_id} =~ m/(.+)\.(\d+)/; + } + elsif (exists $fields{protein_id}) { + ($fields{accession},$fields{accession_ver}) = $fields{protein_id} =~ m/(.+)\.(\d+)/; + } + else { + return; + } + my ($variant) = $fields{product} =~ m/variant\s*(\d+)/i; + if (not defined $variant or not length $variant) { + ($variant) = $fields{product} =~ m/isoform\s*([^\s]+)/; + } + if (not defined $variant or not length $variant) { # or $variant !~ /^\d+$/) { + $variant = 'NULL'; + } + my @extents = split /\s*\,\s*/,$extent; + if ($type eq 'mrna') { + output_current_mrna(\@extents,$variant,$complement,\%fields,$current_contig); + } + else { + output_current_cds(\@extents,$variant,$complement,\%fields,$current_contig); + } +} + +sub output_current_mrna{ + my ($extents,$variant,$complement,$fields,$current_contig) = @_; + # parse this out +# gi is 224514618 (start 10001, stop 267719) +# complement(join(126698..126805,126953..129696, +# 129790..129847,130075..130566)) +# /gene="LOC729737" +# /product="hypothetical protein LOC729737" +# /note="Derived by automated computational analysis using +# gene prediction method: GNOMON. Supporting evidence +# includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein" +# /transcript_id="AM_001133863.3" +# /db_xref="GI:310118477" + # /db_xref="Aeneid:729737" + my @output_fields = (@{$fields}{qw(gi accession accession_ver)}, + $variant, + $complement ? 'TRUE':'FALSE', + @{$fields}{qw(gene geneid)}, + @{$current_contig}{qw(gi chr)}, + ); + my $previous_stop = undef; + my $previous_start = undef; + my $number = 1; + my @exons = @{$extents}; + if ($complement) { + @exons = reverse @exons; + } + for my $exon (@exons) { + # strip off non-numerics like '>' and '<' here, + # because we don't want to deal with them + my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon; + if (not defined $stop) { + $stop = $start; + } + if (defined $previous_stop and $previous_stop > 0 and + defined $previous_start and $previous_start > 0 + ) { + my $intron_start = $previous_stop+1; + my $intron_stop = $start-1; + if ($complement) { + $intron_start = $stop+1; + $intron_stop = $previous_start-1; + } + print join("\t", + @output_fields, + 'FALSE', + $number-1, + $current_contig->{chr_start}+$intron_start-1, + $current_contig->{chr_start}+$intron_stop-1, + $intron_start, + $intron_stop, + )."\n"; + } + print join("\t", + @output_fields, + 'TRUE', + $number, + $current_contig->{chr_start}+$start-1, + $current_contig->{chr_start}+$stop-1, + $start, + $stop, + )."\n"; + $number++; + $previous_stop = $stop; + $previous_start = $start; + } +} + + +sub output_current_cds { + my ($extents,$variant,$complement,$fields,$current_contig) = @_; + # parse this out +# gi is 224514618 (start 10001, stop 267719) +# complement(join(126698..126805,126953..129696, +# 129790..129847,130075..130566)) +# /gene="LOC729737" +# /product="hypothetical protein LOC729737" +# /note="Derived by automated computational analysis using +# gene prediction method: GNOMON. Supporting evidence +# includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein" +# /transcript_id="AM_001133863.3" +# /db_xref="GI:310118477" + # /db_xref="Aeneid:729737" + if (not exists $fields->{protein_id}) { + return; + } + my @output_fields = (@{$fields}{qw(gi accession accession_ver)}, + $variant, + $complement ? 'TRUE':'FALSE', + @{$fields}{qw(gene geneid)}, + @{$current_contig}{qw(gi chr)}, + ); + my $previous_stop = undef; + my $previous_start = undef; + my $number = 1; + my @exons = @{$extents}; + if ($complement) { + @exons = reverse @exons; + } + my $position = 0; + my $current_phase = 0; + for my $exon (@exons) { + # strip off non-numerics like '>' and '<' here, + # because we don't want to deal with them + my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon; + if (not defined $stop) { + $stop = $start; + } + print join("\t", + @output_fields, + $number, + $current_phase, + $current_contig->{chr_start}+$start-1, + $current_contig->{chr_start}+$stop-1, + $start, + $stop, + $position, + )."\n"; + $current_phase = ($current_phase + $stop - $start + 1) % 3; + $number++; + $previous_stop = $stop; + $previous_start = $start; + $position += abs($start-$stop)+1; + } +}