2 # intron_exon_loader makes loadable mrna and cds files, and is
3 # released under the terms of the GPL version 2, or any later version,
4 # at your option. See the file README and COPYING for more
6 # Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
16 intron_exon_loader - Make SQL loadable mrna and cds files
23 --debug, -d debugging level (Default 0)
24 --help, -h display this help
25 --man, -m display manual
33 Debug verbosity. (Default 0)
37 Display brief usage information.
47 zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
49 psql snp -c "COPY mrna FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
51 zcat CHR_*/hs_ref_GRCh37.p2_*.gbs.gz | \
52 intron_exon_loader --output cds| \
53 psql snp -c "COPY cds FROM STDIN WITH DELIMITER ' ' NULL AS 'NULL'";
60 my %options = (type => 'mrna',
68 'debug|d+','help|h|?','man|m');
70 pod2usage() if $options{help};
71 pod2usage({verbose=>2}) if $options{man};
73 $DEBUG = $options{debug};
76 if ($options{type} !~ /^(mrna|cds)$/) {
77 push @USAGE_ERRORS, "--type must be one of mrna or cds, not '$options{type}'";
79 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
84 my $dbh = DBI->connect("dbi:Pg:dbname=snp",'','',{AutoCommit => 0}) or
85 die "Unable to connect to database".$DBI::errstr;
87 my $sth = $dbh->prepare(<<'END') // die "Unable to prepare snpinfo statement: ".$dbh->errstr;
88 SELECT * FROM contigs where gi=?;
95 my $skip_until_next_contig;
96 my $current_stanza = '';
97 my $type = lc($options{type});
98 my $stanza_type = $type eq 'mrna' ? 'mRNA' : 'CDS';
102 $skip_until_next_contig = 0;
103 my ($acc,$ver,$gi) = $_ =~ m/^VERSION\s+([A-Z0-9_]+)\.(\d+)\s+GI\:(\d+)/;
104 if (not defined $gi) {
105 die "No GI in '$_' on line $.";
107 %current_contig = (acc => $acc,
111 %current_contig = contig_information($dbh,$sth,%current_contig);
112 if (not defined $current_contig{chr_start}) {
113 $skip_until_next_contig = 1;
116 next if $skip_until_next_contig;
118 if ($1 eq $stanza_type) {
119 if (length $current_stanza and $in_stanza) {
120 output_current_stanza($type,$current_stanza,\%current_contig);
121 $current_stanza = '';
127 $current_stanza = $_."\n";
128 } elsif (length $current_stanza) {
129 output_current_stanza($type,$current_stanza,\%current_contig);
130 $current_stanza = '';
136 elsif (/^[A-Z]/ and length $current_stanza) {
137 output_current_stanza($type,$current_stanza,\%current_contig);
138 $current_stanza = '';
142 $current_stanza .= $_."\n";
150 sub contig_information{
151 my ($dbh,$sth,%current_contig) = @_;
153 my $rv = $sth->execute($current_contig{gi}) or
154 die "Unable to execute statement properly: ".$dbh->errstr;
156 my $results = $sth->fetchall_hashref('gi');
158 print STDERR $sth->errstr;
160 if (exists $results->{$current_contig{gi}}) {
161 %current_contig = (%current_contig,
162 %{$results->{$current_contig{gi}}},
165 return %current_contig;
168 sub output_current_stanza{
169 my ($type,$current_stanza,$current_contig) = @_;
170 $current_stanza =~ s/^[\s\t]+//mg;
171 $current_stanza =~ s/\n\s*([^\/])/ $1/mg;
172 my ($extent,@other_fields) = split /\n/,$current_stanza;
174 # print "extent: $extent\n";
176 my $complement = $extent =~ m/complement/;
177 $extent =~ s/complement\((.+)\)$/$1/;
178 # strip out the join, because we don't care about it
179 $extent =~ s/join\((.+)\)$/$1/;
181 my %fields = map {my ($key,$value) = $_ =~ m{\/([^=]+)="?(.+?)"?$}s;
182 (not defined $key)?():(($key eq 'db_xref' && $value =~ m/([^:]+):(.+)/)?(lc($1),$2):(lc($key),$value))} @other_fields;
183 return if not exists $fields{product};
184 if (exists $fields{transcript_id}) {
185 ($fields{accession},$fields{accession_ver}) = $fields{transcript_id} =~ m/(.+)\.(\d+)/;
187 elsif (exists $fields{protein_id}) {
188 ($fields{accession},$fields{accession_ver}) = $fields{protein_id} =~ m/(.+)\.(\d+)/;
193 my ($variant) = $fields{product} =~ m/variant\s*(\d+)/i;
194 if (not defined $variant or not length $variant) {
195 ($variant) = $fields{product} =~ m/isoform\s*([^\s]+)/;
197 if (not defined $variant or not length $variant) { # or $variant !~ /^\d+$/) {
200 my @extents = split /\s*\,\s*/,$extent;
201 if ($type eq 'mrna') {
202 output_current_mrna(\@extents,$variant,$complement,\%fields,$current_contig);
205 output_current_cds(\@extents,$variant,$complement,\%fields,$current_contig);
209 sub output_current_mrna{
210 my ($extents,$variant,$complement,$fields,$current_contig) = @_;
212 # gi is 224514618 (start 10001, stop 267719)
213 # complement(join(126698..126805,126953..129696,
214 # 129790..129847,130075..130566))
216 # /product="hypothetical protein LOC729737"
217 # /note="Derived by automated computational analysis using
218 # gene prediction method: GNOMON. Supporting evidence
219 # includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
220 # /transcript_id="AM_001133863.3"
221 # /db_xref="GI:310118477"
222 # /db_xref="Aeneid:729737"
223 my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
225 $complement ? 'TRUE':'FALSE',
226 @{$fields}{qw(gene geneid)},
227 @{$current_contig}{qw(gi chr)},
229 my $previous_stop = undef;
230 my $previous_start = undef;
232 my @exons = @{$extents};
234 @exons = reverse @exons;
236 for my $exon (@exons) {
237 # strip off non-numerics like '>' and '<' here,
238 # because we don't want to deal with them
239 my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
240 if (not defined $stop) {
243 if (defined $previous_stop and $previous_stop > 0 and
244 defined $previous_start and $previous_start > 0
246 my $intron_start = $previous_stop+1;
247 my $intron_stop = $start-1;
249 $intron_start = $stop+1;
250 $intron_stop = $previous_start-1;
256 $current_contig->{chr_start}+$intron_start-1,
257 $current_contig->{chr_start}+$intron_stop-1,
266 $current_contig->{chr_start}+$start-1,
267 $current_contig->{chr_start}+$stop-1,
272 $previous_stop = $stop;
273 $previous_start = $start;
278 sub output_current_cds {
279 my ($extents,$variant,$complement,$fields,$current_contig) = @_;
281 # gi is 224514618 (start 10001, stop 267719)
282 # complement(join(126698..126805,126953..129696,
283 # 129790..129847,130075..130566))
285 # /product="hypothetical protein LOC729737"
286 # /note="Derived by automated computational analysis using
287 # gene prediction method: GNOMON. Supporting evidence
288 # includes similarity to: 2 mRNAs, 34 ESTs, 1 Protein"
289 # /transcript_id="AM_001133863.3"
290 # /db_xref="GI:310118477"
291 # /db_xref="Aeneid:729737"
292 if (not exists $fields->{protein_id}) {
295 my @output_fields = (@{$fields}{qw(gi accession accession_ver)},
297 $complement ? 'TRUE':'FALSE',
298 @{$fields}{qw(gene geneid)},
299 @{$current_contig}{qw(gi chr)},
301 my $previous_stop = undef;
302 my $previous_start = undef;
304 my @exons = @{$extents};
306 @exons = reverse @exons;
309 my $current_phase = 0;
310 for my $exon (@exons) {
311 # strip off non-numerics like '>' and '<' here,
312 # because we don't want to deal with them
313 my ($start,$stop) = map {m/(\d+)/; $1;} split /\.\./,$exon;
314 if (not defined $stop) {
321 $current_contig->{chr_start}+$start-1,
322 $current_contig->{chr_start}+$stop-1,
327 $current_phase = ($current_phase + $stop - $start + 1) % 3;
329 $previous_stop = $stop;
330 $previous_start = $start;
331 $position += abs($start-$stop)+1;