]> git.donarmstrong.com Git - dbsnp.git/blobdiff - utils/intron_exon_loader.pl
add intron/exon schema and mrna cds loader
[dbsnp.git] / utils / intron_exon_loader.pl
diff --git a/utils/intron_exon_loader.pl b/utils/intron_exon_loader.pl
new file mode 100755 (executable)
index 0000000..99ace9d
--- /dev/null
@@ -0,0 +1,333 @@
+#!/usr/bin/perl
+# intron_exon_loader makes loadable mrna and cds files, and is
+# released under the terms of the GPL version 2, or any later version,
+# at your option. See the file README and COPYING for more
+# information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+intron_exon_loader - Make SQL loadable mrna and cds files
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader| \
+ psql snp -c "COPY mrna FROM STDIN WITH DELIMITER '   ' NULL AS 'NULL'";
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader --output cds| \
+ psql snp -c "COPY cds FROM STDIN WITH DELIMITER '   ' NULL AS 'NULL'";
+
+
+=cut
+
+use vars qw($DEBUG);
+
+my %options = (type            => 'mrna',
+              debug           => 0,
+              help            => 0,
+              man             => 0,
+              );
+
+GetOptions(\%options,
+          'type|t=s',
+          'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+if ($options{type} !~ /^(mrna|cds)$/) {
+    push @USAGE_ERRORS, "--type must be one of mrna or cds, not '$options{type}'";
+}
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+use DBI;
+use Data::Dumper;
+
+my $dbh = DBI->connect("dbi:Pg:dbname=snp",'','',{AutoCommit => 0}) or
+    die "Unable to connect to database".$DBI::errstr;
+
+my $sth = $dbh->prepare(<<'END') // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT * FROM contigs where gi=?;
+END
+
+sub main {
+    my ($options) = @_;
+    my %current_contig;
+    my $in_stanza = 0;
+    my $skip_until_next_contig;
+    my $current_stanza = '';
+    my $type = lc($options{type});
+    my $stanza_type = $type eq 'mrna' ? 'mRNA' : 'CDS';
+    while (<>) {
+       chomp;
+       if (/^VERSION/) {
+           $skip_until_next_contig = 0;
+           my ($acc,$ver,$gi) = $_ =~ m/^VERSION\s+([A-Z0-9_]+)\.(\d+)\s+GI\:(\d+)/;
+           if (not defined $gi) {
+               die "No GI in '$_' on line $.";
+           }
+           %current_contig = (acc => $acc,
+                              ver => $ver,
+                              gi  => $gi,
+                             );
+           %current_contig = contig_information($dbh,$sth,%current_contig);
+           if (not defined $current_contig{chr_start}) {
+               $skip_until_next_contig = 1;
+           }
+       }
+       next if $skip_until_next_contig;
+       if (/^\s{5}(\S+)/) {
+           if ($1 eq $stanza_type) {
+               if (length $current_stanza and $in_stanza) {
+                   output_current_stanza($type,$current_stanza,\%current_contig);
+                   $current_stanza = '';
+                   $in_stanza = 0;
+               }
+               $in_stanza = 1;
+               s/^\s{5}\S+//;
+               s/^\s+//;
+               $current_stanza = $_."\n";
+           } elsif (length $current_stanza) {
+               output_current_stanza($type,$current_stanza,\%current_contig);
+               $current_stanza = '';
+               $in_stanza = 0;
+           } else {
+               $in_stanza = 0;
+           }
+       }
+       elsif (/^[A-Z]/ and length $current_stanza) {
+           output_current_stanza($type,$current_stanza,\%current_contig);
+           $current_stanza = '';
+           $in_stanza = 0;
+       } else {
+           if ($in_stanza) {
+               $current_stanza .= $_."\n";
+           }
+       }
+    }
+}
+
+main();
+
+sub contig_information{
+    my ($dbh,$sth,%current_contig) = @_;
+
+    my $rv = $sth->execute($current_contig{gi}) or
+       die "Unable to execute statement properly: ".$dbh->errstr;
+
+    my $results = $sth->fetchall_hashref('gi');
+    if ($sth->err) {
+       print STDERR $sth->errstr;
+    }
+    if (exists $results->{$current_contig{gi}}) {
+       %current_contig = (%current_contig,
+                          %{$results->{$current_contig{gi}}},
+                         );
+    }
+    return %current_contig;
+}
+
+sub output_current_stanza{
+    my ($type,$current_stanza,$current_contig) = @_;
+    $current_stanza =~ s/^[\s\t]+//mg;
+    $current_stanza =~ s/\n\s*([^\/])/ $1/mg;
+    my ($extent,@other_fields) = split /\n/,$current_stanza;
+    $extent =~ s/\s+//g;
+    #      print "extent: $extent\n";
+    # figure out extent
+    my $complement = $extent =~ m/complement/;
+    $extent =~ s/complement\((.+)\)$/$1/;
+    # strip out the join, because we don't care about it
+    $extent =~ s/join\((.+)\)$/$1/;
+    # handle fields
+    my %fields = map {my ($key,$value) = $_ =~ m{\/([^=]+)="?(.+?)"?$}s;
+                     (not defined $key)?():(($key eq 'db_xref' && $value =~ m/([^:]+):(.+)/)?(lc($1),$2):(lc($key),$value))} @other_fields;
+    return if not exists $fields{product};
+    if (exists $fields{transcript_id}) {
+       ($fields{accession},$fields{accession_ver}) = $fields{transcript_id} =~ m/(.+)\.(\d+)/;
+    }
+    elsif (exists $fields{protein_id}) {
+       ($fields{accession},$fields{accession_ver}) = $fields{protein_id} =~ m/(.+)\.(\d+)/;
+    }
+    else {
+       return;
+    }
+    my ($variant) = $fields{product} =~ m/variant\s*(\d+)/i;
+    if (not defined $variant or not length $variant) {
+       ($variant) = $fields{product} =~ m/isoform\s*([^\s]+)/;
+    }
+    if (not defined $variant or not length $variant) { # or $variant !~ /^\d+$/) {
+       $variant = 'NULL';
+    }
+    my @extents = split /\s*\,\s*/,$extent;
+    if ($type eq 'mrna') {
+       output_current_mrna(\@extents,$variant,$complement,\%fields,$current_contig);
+    }
+    else {
+       output_current_cds(\@extents,$variant,$complement,\%fields,$current_contig);
+    }
+}
+
+sub output_current_mrna{
+    my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+           # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+#          complement(join(126698..126805,126953..129696,
+#                      129790..129847,130075..130566))
+#                      /gene="LOC729737"
+#                      /product="hypothetical protein LOC729737"
+#                      /note="Derived by automated computational analysis using
+#                      gene prediction method: GNOMON. Supporting evidence
+#                      includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+#                      /transcript_id="AM_001133863.3"
+#                      /db_xref="GI:310118477"
+    #                      /db_xref="Aeneid:729737"
+    my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+                        $variant,
+                        $complement ? 'TRUE':'FALSE',
+                        @{$fields}{qw(gene geneid)},
+                        @{$current_contig}{qw(gi chr)},
+                       );
+    my $previous_stop = undef;
+    my $previous_start = undef;
+    my $number = 1;
+    my @exons = @{$extents};
+    if ($complement) {
+       @exons = reverse @exons;
+    }
+    for my $exon (@exons) {
+       # strip off non-numerics like '>' and '<' here,
+       # because we don't want to deal with them
+       my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+       if (not defined $stop) {
+           $stop = $start;
+       }
+       if (defined $previous_stop and $previous_stop > 0 and
+           defined $previous_start and $previous_start > 0
+          ) {
+           my $intron_start = $previous_stop+1;
+           my $intron_stop = $start-1;
+           if ($complement) {
+               $intron_start = $stop+1;
+               $intron_stop = $previous_start-1;
+           }
+           print join("\t",
+                      @output_fields,
+                      'FALSE',
+                      $number-1,
+                      $current_contig->{chr_start}+$intron_start-1,
+                      $current_contig->{chr_start}+$intron_stop-1,
+                      $intron_start,
+                      $intron_stop,
+                     )."\n";
+       }
+       print join("\t",
+                  @output_fields,
+                  'TRUE',
+                  $number,
+                  $current_contig->{chr_start}+$start-1,
+                  $current_contig->{chr_start}+$stop-1,
+                  $start,
+                  $stop,
+                 )."\n";
+       $number++;
+       $previous_stop = $stop;
+       $previous_start = $start;
+    }
+}
+
+
+sub output_current_cds {
+    my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+           # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+#          complement(join(126698..126805,126953..129696,
+#                      129790..129847,130075..130566))
+#                      /gene="LOC729737"
+#                      /product="hypothetical protein LOC729737"
+#                      /note="Derived by automated computational analysis using
+#                      gene prediction method: GNOMON. Supporting evidence
+#                      includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+#                      /transcript_id="AM_001133863.3"
+#                      /db_xref="GI:310118477"
+    #                      /db_xref="Aeneid:729737"
+    if (not exists $fields->{protein_id}) {
+       return;
+    }
+    my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+                        $variant,
+                        $complement ? 'TRUE':'FALSE',
+                        @{$fields}{qw(gene geneid)},
+                        @{$current_contig}{qw(gi chr)},
+                       );
+    my $previous_stop = undef;
+    my $previous_start = undef;
+    my $number = 1;
+    my @exons = @{$extents};
+    if ($complement) {
+       @exons = reverse @exons;
+    }
+    my $position = 0;
+    my $current_phase = 0;
+    for my $exon (@exons) {
+       # strip off non-numerics like '>' and '<' here,
+       # because we don't want to deal with them
+       my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+       if (not defined $stop) {
+           $stop = $start;
+       }
+       print join("\t",
+                  @output_fields,
+                  $number,
+                  $current_phase,
+                  $current_contig->{chr_start}+$start-1,
+                  $current_contig->{chr_start}+$stop-1,
+                  $start,
+                  $stop,
+                  $position,
+                 )."\n";
+       $current_phase = ($current_phase + $stop - $start + 1) % 3;
+       $number++;
+       $previous_stop = $stop;
+       $previous_start = $start;
+       $position += abs($start-$stop)+1;
+    }
+}