X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=utils%2Fintron_exon_loader.pl;fp=utils%2Fintron_exon_loader.pl;h=99ace9de358d5aac71083e4d00db7f414ef5f0c9;hb=95ae8613ee3ff948d36ce633404102c3ad90881f;hp=0000000000000000000000000000000000000000;hpb=cccc00fef4cbece5fd2c5cf05a7a1182263fd53b;p=dbsnp.git diff --git a/utils/intron_exon_loader.pl b/utils/intron_exon_loader.pl new file mode 100755 index 0000000..99ace9d --- /dev/null +++ b/utils/intron_exon_loader.pl @@ -0,0 +1,333 @@ +#!/usr/bin/perl +# intron_exon_loader makes loadable mrna and cds files, and is +# released under the terms of the GPL version 2, or any later version, +# at your option. See the file README and COPYING for more +# information. +# Copyright 2011 by Don Armstrong . + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +intron_exon_loader - Make SQL loadable mrna and cds files + +=head1 SYNOPSIS + + [options] + + Options: + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + + zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \ + intron_exon_loader| \ + psql snp -c "COPY mrna FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'"; + + zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \ + intron_exon_loader --output cds| \ + psql snp -c "COPY cds FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'"; + + +=cut + +use vars qw($DEBUG); + +my %options = (type => 'mrna', + debug => 0, + help => 0, + man => 0, + ); + +GetOptions(\%options, + 'type|t=s', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; +if ($options{type} !~ /^(mrna|cds)$/) { + push @USAGE_ERRORS, "--type must be one of mrna or cds, not '$options{type}'"; +} +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + +use DBI; +use Data::Dumper; + +my $dbh = DBI->connect("dbi:Pg:dbname=snp",'','',{AutoCommit => 0}) or + die "Unable to connect to database".$DBI::errstr; + +my $sth = $dbh->prepare(<<'END') // die "Unable to prepare snpinfo statement: ".$dbh->errstr; +SELECT * FROM contigs where gi=?; +END + +sub main { + my ($options) = @_; + my %current_contig; + my $in_stanza = 0; + my $skip_until_next_contig; + my $current_stanza = ''; + my $type = lc($options{type}); + my $stanza_type = $type eq 'mrna' ? 'mRNA' : 'CDS'; + while (<>) { + chomp; + if (/^VERSION/) { + $skip_until_next_contig = 0; + my ($acc,$ver,$gi) = $_ =~ m/^VERSION\s+([A-Z0-9_]+)\.(\d+)\s+GI\:(\d+)/; + if (not defined $gi) { + die "No GI in '$_' on line $."; + } + %current_contig = (acc => $acc, + ver => $ver, + gi => $gi, + ); + %current_contig = contig_information($dbh,$sth,%current_contig); + if (not defined $current_contig{chr_start}) { + $skip_until_next_contig = 1; + } + } + next if $skip_until_next_contig; + if (/^\s{5}(\S+)/) { + if ($1 eq $stanza_type) { + if (length $current_stanza and $in_stanza) { + output_current_stanza($type,$current_stanza,\%current_contig); + $current_stanza = ''; + $in_stanza = 0; + } + $in_stanza = 1; + s/^\s{5}\S+//; + s/^\s+//; + $current_stanza = $_."\n"; + } elsif (length $current_stanza) { + output_current_stanza($type,$current_stanza,\%current_contig); + $current_stanza = ''; + $in_stanza = 0; + } else { + $in_stanza = 0; + } + } + elsif (/^[A-Z]/ and length $current_stanza) { + output_current_stanza($type,$current_stanza,\%current_contig); + $current_stanza = ''; + $in_stanza = 0; + } else { + if ($in_stanza) { + $current_stanza .= $_."\n"; + } + } + } +} + +main(); + +sub contig_information{ + my ($dbh,$sth,%current_contig) = @_; + + my $rv = $sth->execute($current_contig{gi}) or + die "Unable to execute statement properly: ".$dbh->errstr; + + my $results = $sth->fetchall_hashref('gi'); + if ($sth->err) { + print STDERR $sth->errstr; + } + if (exists $results->{$current_contig{gi}}) { + %current_contig = (%current_contig, + %{$results->{$current_contig{gi}}}, + ); + } + return %current_contig; +} + +sub output_current_stanza{ + my ($type,$current_stanza,$current_contig) = @_; + $current_stanza =~ s/^[\s\t]+//mg; + $current_stanza =~ s/\n\s*([^\/])/ $1/mg; + my ($extent,@other_fields) = split /\n/,$current_stanza; + $extent =~ s/\s+//g; + # print "extent: $extent\n"; + # figure out extent + my $complement = $extent =~ m/complement/; + $extent =~ s/complement\((.+)\)$/$1/; + # strip out the join, because we don't care about it + $extent =~ s/join\((.+)\)$/$1/; + # handle fields + my %fields = map {my ($key,$value) = $_ =~ m{\/([^=]+)="?(.+?)"?$}s; + (not defined $key)?():(($key eq 'db_xref' && $value =~ m/([^:]+):(.+)/)?(lc($1),$2):(lc($key),$value))} @other_fields; + return if not exists $fields{product}; + if (exists $fields{transcript_id}) { + ($fields{accession},$fields{accession_ver}) = $fields{transcript_id} =~ m/(.+)\.(\d+)/; + } + elsif (exists $fields{protein_id}) { + ($fields{accession},$fields{accession_ver}) = $fields{protein_id} =~ m/(.+)\.(\d+)/; + } + else { + return; + } + my ($variant) = $fields{product} =~ m/variant\s*(\d+)/i; + if (not defined $variant or not length $variant) { + ($variant) = $fields{product} =~ m/isoform\s*([^\s]+)/; + } + if (not defined $variant or not length $variant) { # or $variant !~ /^\d+$/) { + $variant = 'NULL'; + } + my @extents = split /\s*\,\s*/,$extent; + if ($type eq 'mrna') { + output_current_mrna(\@extents,$variant,$complement,\%fields,$current_contig); + } + else { + output_current_cds(\@extents,$variant,$complement,\%fields,$current_contig); + } +} + +sub output_current_mrna{ + my ($extents,$variant,$complement,$fields,$current_contig) = @_; + # parse this out +# gi is 224514618 (start 10001, stop 267719) +# complement(join(126698..126805,126953..129696, +# 129790..129847,130075..130566)) +# /gene="LOC729737" +# /product="hypothetical protein LOC729737" +# /note="Derived by automated computational analysis using +# gene prediction method: GNOMON. Supporting evidence +# includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein" +# /transcript_id="AM_001133863.3" +# /db_xref="GI:310118477" + # /db_xref="Aeneid:729737" + my @output_fields = (@{$fields}{qw(gi accession accession_ver)}, + $variant, + $complement ? 'TRUE':'FALSE', + @{$fields}{qw(gene geneid)}, + @{$current_contig}{qw(gi chr)}, + ); + my $previous_stop = undef; + my $previous_start = undef; + my $number = 1; + my @exons = @{$extents}; + if ($complement) { + @exons = reverse @exons; + } + for my $exon (@exons) { + # strip off non-numerics like '>' and '<' here, + # because we don't want to deal with them + my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon; + if (not defined $stop) { + $stop = $start; + } + if (defined $previous_stop and $previous_stop > 0 and + defined $previous_start and $previous_start > 0 + ) { + my $intron_start = $previous_stop+1; + my $intron_stop = $start-1; + if ($complement) { + $intron_start = $stop+1; + $intron_stop = $previous_start-1; + } + print join("\t", + @output_fields, + 'FALSE', + $number-1, + $current_contig->{chr_start}+$intron_start-1, + $current_contig->{chr_start}+$intron_stop-1, + $intron_start, + $intron_stop, + )."\n"; + } + print join("\t", + @output_fields, + 'TRUE', + $number, + $current_contig->{chr_start}+$start-1, + $current_contig->{chr_start}+$stop-1, + $start, + $stop, + )."\n"; + $number++; + $previous_stop = $stop; + $previous_start = $start; + } +} + + +sub output_current_cds { + my ($extents,$variant,$complement,$fields,$current_contig) = @_; + # parse this out +# gi is 224514618 (start 10001, stop 267719) +# complement(join(126698..126805,126953..129696, +# 129790..129847,130075..130566)) +# /gene="LOC729737" +# /product="hypothetical protein LOC729737" +# /note="Derived by automated computational analysis using +# gene prediction method: GNOMON. Supporting evidence +# includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein" +# /transcript_id="AM_001133863.3" +# /db_xref="GI:310118477" + # /db_xref="Aeneid:729737" + if (not exists $fields->{protein_id}) { + return; + } + my @output_fields = (@{$fields}{qw(gi accession accession_ver)}, + $variant, + $complement ? 'TRUE':'FALSE', + @{$fields}{qw(gene geneid)}, + @{$current_contig}{qw(gi chr)}, + ); + my $previous_stop = undef; + my $previous_start = undef; + my $number = 1; + my @exons = @{$extents}; + if ($complement) { + @exons = reverse @exons; + } + my $position = 0; + my $current_phase = 0; + for my $exon (@exons) { + # strip off non-numerics like '>' and '<' here, + # because we don't want to deal with them + my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon; + if (not defined $stop) { + $stop = $start; + } + print join("\t", + @output_fields, + $number, + $current_phase, + $current_contig->{chr_start}+$start-1, + $current_contig->{chr_start}+$stop-1, + $start, + $stop, + $position, + )."\n"; + $current_phase = ($current_phase + $stop - $start + 1) % 3; + $number++; + $previous_stop = $stop; + $previous_start = $start; + $position += abs($start-$stop)+1; + } +}