--- /dev/null
+#!/usr/bin/perl
+# intron_exon_loader makes loadable mrna and cds files, and is
+# released under the terms of the GPL version 2, or any later version,
+# at your option. See the file README and COPYING for more
+# information.
+# Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+intron_exon_loader - Make SQL loadable mrna and cds files
+
+=head1 SYNOPSIS
+
+ [options]
+
+ Options:
+ --debug, -d debugging level (Default 0)
+ --help, -h display this help
+ --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader| \
+ psql snp -c "COPY mrna FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
+
+ zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
+ intron_exon_loader --output cds| \
+ psql snp -c "COPY cds FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
+
+
+=cut
+
+use vars qw($DEBUG);
+
+my %options = (type => 'mrna',
+ debug => 0,
+ help => 0,
+ man => 0,
+ );
+
+GetOptions(\%options,
+ 'type|t=s',
+ 'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+if ($options{type} !~ /^(mrna|cds)$/) {
+ push @USAGE_ERRORS, "--type must be one of mrna or cds, not '$options{type}'";
+}
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+use DBI;
+use Data::Dumper;
+
+my $dbh = DBI->connect("dbi:Pg:dbname=snp",'','',{AutoCommit => 0}) or
+ die "Unable to connect to database".$DBI::errstr;
+
+my $sth = $dbh->prepare(<<'END') // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
+SELECT * FROM contigs where gi=?;
+END
+
+sub main {
+ my ($options) = @_;
+ my %current_contig;
+ my $in_stanza = 0;
+ my $skip_until_next_contig;
+ my $current_stanza = '';
+ my $type = lc($options{type});
+ my $stanza_type = $type eq 'mrna' ? 'mRNA' : 'CDS';
+ while (<>) {
+ chomp;
+ if (/^VERSION/) {
+ $skip_until_next_contig = 0;
+ my ($acc,$ver,$gi) = $_ =~ m/^VERSION\s+([A-Z0-9_]+)\.(\d+)\s+GI\:(\d+)/;
+ if (not defined $gi) {
+ die "No GI in '$_' on line $.";
+ }
+ %current_contig = (acc => $acc,
+ ver => $ver,
+ gi => $gi,
+ );
+ %current_contig = contig_information($dbh,$sth,%current_contig);
+ if (not defined $current_contig{chr_start}) {
+ $skip_until_next_contig = 1;
+ }
+ }
+ next if $skip_until_next_contig;
+ if (/^\s{5}(\S+)/) {
+ if ($1 eq $stanza_type) {
+ if (length $current_stanza and $in_stanza) {
+ output_current_stanza($type,$current_stanza,\%current_contig);
+ $current_stanza = '';
+ $in_stanza = 0;
+ }
+ $in_stanza = 1;
+ s/^\s{5}\S+//;
+ s/^\s+//;
+ $current_stanza = $_."\n";
+ } elsif (length $current_stanza) {
+ output_current_stanza($type,$current_stanza,\%current_contig);
+ $current_stanza = '';
+ $in_stanza = 0;
+ } else {
+ $in_stanza = 0;
+ }
+ }
+ elsif (/^[A-Z]/ and length $current_stanza) {
+ output_current_stanza($type,$current_stanza,\%current_contig);
+ $current_stanza = '';
+ $in_stanza = 0;
+ } else {
+ if ($in_stanza) {
+ $current_stanza .= $_."\n";
+ }
+ }
+ }
+}
+
+main();
+
+sub contig_information{
+ my ($dbh,$sth,%current_contig) = @_;
+
+ my $rv = $sth->execute($current_contig{gi}) or
+ die "Unable to execute statement properly: ".$dbh->errstr;
+
+ my $results = $sth->fetchall_hashref('gi');
+ if ($sth->err) {
+ print STDERR $sth->errstr;
+ }
+ if (exists $results->{$current_contig{gi}}) {
+ %current_contig = (%current_contig,
+ %{$results->{$current_contig{gi}}},
+ );
+ }
+ return %current_contig;
+}
+
+sub output_current_stanza{
+ my ($type,$current_stanza,$current_contig) = @_;
+ $current_stanza =~ s/^[\s\t]+//mg;
+ $current_stanza =~ s/\n\s*([^\/])/ $1/mg;
+ my ($extent,@other_fields) = split /\n/,$current_stanza;
+ $extent =~ s/\s+//g;
+ # print "extent: $extent\n";
+ # figure out extent
+ my $complement = $extent =~ m/complement/;
+ $extent =~ s/complement\((.+)\)$/$1/;
+ # strip out the join, because we don't care about it
+ $extent =~ s/join\((.+)\)$/$1/;
+ # handle fields
+ my %fields = map {my ($key,$value) = $_ =~ m{\/([^=]+)="?(.+?)"?$}s;
+ (not defined $key)?():(($key eq 'db_xref' && $value =~ m/([^:]+):(.+)/)?(lc($1),$2):(lc($key),$value))} @other_fields;
+ return if not exists $fields{product};
+ if (exists $fields{transcript_id}) {
+ ($fields{accession},$fields{accession_ver}) = $fields{transcript_id} =~ m/(.+)\.(\d+)/;
+ }
+ elsif (exists $fields{protein_id}) {
+ ($fields{accession},$fields{accession_ver}) = $fields{protein_id} =~ m/(.+)\.(\d+)/;
+ }
+ else {
+ return;
+ }
+ my ($variant) = $fields{product} =~ m/variant\s*(\d+)/i;
+ if (not defined $variant or not length $variant) {
+ ($variant) = $fields{product} =~ m/isoform\s*([^\s]+)/;
+ }
+ if (not defined $variant or not length $variant) { # or $variant !~ /^\d+$/) {
+ $variant = 'NULL';
+ }
+ my @extents = split /\s*\,\s*/,$extent;
+ if ($type eq 'mrna') {
+ output_current_mrna(\@extents,$variant,$complement,\%fields,$current_contig);
+ }
+ else {
+ output_current_cds(\@extents,$variant,$complement,\%fields,$current_contig);
+ }
+}
+
+sub output_current_mrna{
+ my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+ # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+# complement(join(126698..126805,126953..129696,
+# 129790..129847,130075..130566))
+# /gene="LOC729737"
+# /product="hypothetical protein LOC729737"
+# /note="Derived by automated computational analysis using
+# gene prediction method: GNOMON. Supporting evidence
+# includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+# /transcript_id="AM_001133863.3"
+# /db_xref="GI:310118477"
+ # /db_xref="Aeneid:729737"
+ my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+ $variant,
+ $complement ? 'TRUE':'FALSE',
+ @{$fields}{qw(gene geneid)},
+ @{$current_contig}{qw(gi chr)},
+ );
+ my $previous_stop = undef;
+ my $previous_start = undef;
+ my $number = 1;
+ my @exons = @{$extents};
+ if ($complement) {
+ @exons = reverse @exons;
+ }
+ for my $exon (@exons) {
+ # strip off non-numerics like '>' and '<' here,
+ # because we don't want to deal with them
+ my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+ if (not defined $stop) {
+ $stop = $start;
+ }
+ if (defined $previous_stop and $previous_stop > 0 and
+ defined $previous_start and $previous_start > 0
+ ) {
+ my $intron_start = $previous_stop+1;
+ my $intron_stop = $start-1;
+ if ($complement) {
+ $intron_start = $stop+1;
+ $intron_stop = $previous_start-1;
+ }
+ print join("\t",
+ @output_fields,
+ 'FALSE',
+ $number-1,
+ $current_contig->{chr_start}+$intron_start-1,
+ $current_contig->{chr_start}+$intron_stop-1,
+ $intron_start,
+ $intron_stop,
+ )."\n";
+ }
+ print join("\t",
+ @output_fields,
+ 'TRUE',
+ $number,
+ $current_contig->{chr_start}+$start-1,
+ $current_contig->{chr_start}+$stop-1,
+ $start,
+ $stop,
+ )."\n";
+ $number++;
+ $previous_stop = $stop;
+ $previous_start = $start;
+ }
+}
+
+
+sub output_current_cds {
+ my ($extents,$variant,$complement,$fields,$current_contig) = @_;
+ # parse this out
+# gi is 224514618 (start 10001, stop 267719)
+# complement(join(126698..126805,126953..129696,
+# 129790..129847,130075..130566))
+# /gene="LOC729737"
+# /product="hypothetical protein LOC729737"
+# /note="Derived by automated computational analysis using
+# gene prediction method: GNOMON. Supporting evidence
+# includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
+# /transcript_id="AM_001133863.3"
+# /db_xref="GI:310118477"
+ # /db_xref="Aeneid:729737"
+ if (not exists $fields->{protein_id}) {
+ return;
+ }
+ my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
+ $variant,
+ $complement ? 'TRUE':'FALSE',
+ @{$fields}{qw(gene geneid)},
+ @{$current_contig}{qw(gi chr)},
+ );
+ my $previous_stop = undef;
+ my $previous_start = undef;
+ my $number = 1;
+ my @exons = @{$extents};
+ if ($complement) {
+ @exons = reverse @exons;
+ }
+ my $position = 0;
+ my $current_phase = 0;
+ for my $exon (@exons) {
+ # strip off non-numerics like '>' and '<' here,
+ # because we don't want to deal with them
+ my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
+ if (not defined $stop) {
+ $stop = $start;
+ }
+ print join("\t",
+ @output_fields,
+ $number,
+ $current_phase,
+ $current_contig->{chr_start}+$start-1,
+ $current_contig->{chr_start}+$stop-1,
+ $start,
+ $stop,
+ $position,
+ )."\n";
+ $current_phase = ($current_phase + $stop - $start + 1) % 3;
+ $number++;
+ $previous_stop = $stop;
+ $previous_start = $start;
+ $position += abs($start-$stop)+1;
+ }
+}