From 9ef88bfb5d62395d67d80b0fc6e71ba92b15302c Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Tue, 21 May 2013 18:30:52 -0700 Subject: [PATCH] add gcrma data and affymetrix probe annotation loaders --- schema/extra_schema/gcrma_indexes.sql | 11 + schema/extra_schema/gcrma_table.sql | 46 ++++ utils/load_affymetrix_probe_annotations.pl | 248 +++++++++++++++++++++ utils/load_gcrma_data.pl | 120 ++++++++++ 4 files changed, 425 insertions(+) create mode 100644 schema/extra_schema/gcrma_indexes.sql create mode 100644 schema/extra_schema/gcrma_table.sql create mode 100755 utils/load_affymetrix_probe_annotations.pl create mode 100755 utils/load_gcrma_data.pl diff --git a/schema/extra_schema/gcrma_indexes.sql b/schema/extra_schema/gcrma_indexes.sql new file mode 100644 index 0000000..a6735bd --- /dev/null +++ b/schema/extra_schema/gcrma_indexes.sql @@ -0,0 +1,11 @@ +CREATE UNIQUE INDEX ON gcrma_samples(tissue); +CREATE UNIQUE INDEX ON affy_probe(probe); +CREATE UNIQUE INDEX on affy_annotation(probe); +CREATE UNIQUE INDEX ON gcrma_expression(probe,sample); +ALTER TABLE gcrma_samples ADD PRIMARY KEY (id); +ALTER TABLE affy_probe ADD PRIMARY KEY (id); +ALTER TABLE gcrma_expression ADD PRIMARY KEY (id); +ALTER TABLE affy_annotation ADD PRIMARY KEY (id); +ALTER TABLE affy_annotation ADD FOREIGN KEY (id) REFERENCES affy_probe; +ALTER TABLE gcrma_expression ADD FOREIGN KEY (probe) REFERENCES affy_probe; +ALTER TABLE gcrma_expression ADD FOREIGN KEY (sample) REFERENCES gcrma_samples; diff --git a/schema/extra_schema/gcrma_table.sql b/schema/extra_schema/gcrma_table.sql new file mode 100644 index 0000000..d76383c --- /dev/null +++ b/schema/extra_schema/gcrma_table.sql @@ -0,0 +1,46 @@ +DROP TABLE affy_annotation; +DROP TABLE gcrma_expression; +DROP TABLE affy_probe; +DROP TABLE gcrma_samples; + +CREATE TABLE gcrma_samples ( + id SERIAL NOT NULL, -- PRIMARY KEY, + tissue TEXT NOT NULL +); + +CREATE TABLE affy_probe ( + id SERIAL NOT NULL, -- PRIMARY KEY, + probe TEXT NOT NULL +); + +CREATE TABLE affy_annotation ( + id SERIAL NOT NULL, -- PRIMARY KEY, + probe INT NOT NULL,-- REFERENCES affy_probe, + gene_symbol TEXT, + gene_name TEXT, + species TEXT, + array_name TEXT, + entrez_id TEXT, + refseq_prot TEXT, + refseq_transcript TEXT +); + +CREATE TABLE gcrma_expression ( + id SERIAL NOT NULL, -- PRIMARY KEY, + probe INT NOT NULL,-- REFERENCES affy_probe, + sample INT NOT NULL,-- REFERENCES gcrma_samples, + expression FLOAT NOT NULL +); + +CREATE VIEW gcrma AS + SELECT ge.id AS id, + gs.tissue AS tissue, + ap.probe AS probe, + ge.expression AS expression, + aa.gene_symbol AS symbol + FROM gcrma_expression ge + JOIN affy_probe ap ON ge.probe=ap.id + JOIN gcrma_samples gs ON ge.sample=gs.id + LEFT OUTER JOIN affy_annotation aa ON ap.id=aa.probe; + + diff --git a/utils/load_affymetrix_probe_annotations.pl b/utils/load_affymetrix_probe_annotations.pl new file mode 100755 index 0000000..4ef8ccf --- /dev/null +++ b/utils/load_affymetrix_probe_annotations.pl @@ -0,0 +1,248 @@ +#!/usr/bin/perl +# load_affymetrix_probe_annotations.pl loads affymetrix probe annotations +# and is released under the terms of the GNU GPL version 3, or any +# later version, at your option. See the file README and COPYING for +# more information. +# Copyright 2013 by Don Armstrong . + + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +load_affymetrix_probe_annotations.pl - loads affymetrix probe annotations + +=head1 SYNOPSIS + +load_affymetrix_probe_annotations.pl [options] [annotation files] + + Options: + --service, -s pgsql service + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--service,-s> + +Postgresql service + +=item B<--progress,-p> + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + +load_affymetrix_probe_annotations.pl + +=cut + + +use vars qw($DEBUG); +use DBI; +use Term::ProgressBar; +use Fcntl qw(:seek); +use Text::CSV; + + +my %options = (debug => 0, + help => 0, + man => 0, + service => 'snp', + progress => 1, + ); + +GetOptions(\%options, + 'service|s=s', + 'progress|p', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; +# if (1) { +# push @USAGE_ERRORS,"You must pass something"; +# } + +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + +my $dbh = DBI->connect("dbi:Pg:service=$options{service}", + '','',{AutoCommit => 0}) or + die "Unable to connect to database: ".$DBI::errstr; + +my %sth; +$sth{insert_annotation} = $dbh->prepare(<<'END') // die "Unable to prepare insert annotation statement: ".$dbh->errstr; +INSERT INTO affy_annotation +(probe,gene_symbol, +gene_name,species, +array_name,entrez_id, +refseq_prot,refseq_transcript) VALUES ($1,$2,$3,$4,$5,$6,$7,$8); +END + +$sth{delete_annotation_id} = $dbh->prepare(<<'END') // die "Unable to prepare delete annotation id statement: ".$dbh->errstr; +DELETE FROM affy_annotation aa WHERE aa.probe = $1; +END + +$sth{select_annotation_id} = $dbh->prepare(<<'END') // die "Unable to prepare select annotation id statement: ".$dbh->errstr; +SELECT aa.id FROM affy_annotation aa WHERE aa.probe = $1; +END + + +$sth{select_affy_probe_id} = $dbh->prepare(<<'END') // die "Unable to prepare select annotation id statement: ".$dbh->errstr; +SELECT id FROM affy_probe ap WHERE ap.probe = $1; +END + +$sth{insert_affy_probe_id} = $dbh->prepare(<<'END') // die "Unable to prepare insert affy probe id statement: ".$dbh->errstr; +INSERT INTO affy_probe (probe) VALUES ($1); +END + + + +my @ifh; +for my $ifn (@ARGV) { + my $ifh = IO::File->new($ifn,'r') or + die "Unable to open $ifn for reading: $!"; + push @ifh,$ifh; +} + +if (not @ARGV) { + push @ifh,\*STDIN; +} + +my %header_regex = + (probe => qr/(?i)Probe\s*Set\s*ID/, + gene_symbol => qr/(?i)Gene\s*Symbol/, + gene_name => qr/(?i)Gene\s*Title/, + species => qr/(?i)Species\s*Scientific\s*Name/, + array_name => qr/(?i)GeneChip\s*Array/, + entrez_id => qr/(?i)Entrez\s*Gene/, + refseq_prot => qr/(?i)RefSeq\s*Protein\s*ID/, + refseq_transcript => qr/(?i)RefSeq\s*Transcript\s*ID/, + ); + +for my $ifh (@ifh) { + my $p; + if ($options{progress}) { + if ($ifh->seek(0,SEEK_END)) { + $p = Term::ProgressBar->new({count => $ifh->tell, + remove => 1, + ETA=>'linear'}); + $ifh->seek(0,SEEK_SET); + } + } + my @header; + my $csv = Text::CSV->new({sep_char=>','}); + my %headers; + my %important_headers; + while (<$ifh>) { + chomp; + next if /^#/; + if (not $csv->parse($_)) { + die "Unable to parse line $. of file: ".$csv->error_diag(); + } + my @row = $csv->fields(); + if (not @header) { + @header = @row; + @headers{@header} = 0..$#row; + for my $header (keys %header_regex) { + my @match = + grep { $_ =~ $header_regex{$header} + } keys %headers; + $important_headers{$header} = $headers{$match[0]} if @match; + if (not @match) { + use Data::Printer; + p %important_headers; + p @header; + p %headers; + die "unable to find header match for $header"; + } + } + next; + } + insert_annotation($dbh,\%sth, + {fixup_row(\%important_headers,\@row) + }, + ); + if (defined $p) { + $p->update($ifh->tell); + } + } + $dbh->commit(); +} + +sub insert_annotation { + my ($dbh,$sth,$annot) = @_; + # find the probe id + $annot->{probe_id} = select_affy_probe_id(@_); + # see if this annotation already exists + return unless defined $annot->{probe_id}; + my $annot_id = select_annotation_id(@_); + # if not, insert it + if (not defined $annot_id) { + my $rv = $sth->{insert_annotation}->execute(@{$annot}{(qw(probe_id gene_symbol gene_name species array_name entrez_id), + qw(refseq_prot refseq_transcript), + )}) + // die "Unable to execute statement properly: ".$dbh->errstr; + $sth->{insert_annotation}->finish; + } else { + print STDERR "probe: $annot->{probe} is already annotated ($annot_id)\n" if $DEBUG; + } +} + +sub select_annotation_id { + my ($dbh,$sth,$annot) = @_; + if (not defined $annot->{probe_id}) { + $annot->{probe_id} = select_affy_probe_id(@_); + return unless defined $annot->{probe_id}; + } + my $rv = $sth->{select_annotation_id}->execute($annot->{probe_id}) // + die "Unable to execute statement properly: ".$dbh->errstr; + my ($sample_id) = map {ref $_ ?@{$_}:()} + map {ref $_ ?@{$_}:()} $sth->{select_annotation_id}->fetchall_arrayref([0]); + $sth->{select_annotation_id}->finish; + return ($sample_id); +} + +sub select_affy_probe_id { + my ($dbh,$sth,$annot) = @_; + my $rv = $sth->{select_affy_probe_id}->execute($annot->{probe}) // + die "Unable to execute statement properly: ".$dbh->errstr; + my ($probe_id) = map {ref $_ ?@{$_}:()} + map {ref $_ ?@{$_}:()} $sth->{select_affy_probe_id}->fetchall_arrayref([0]); + $sth->{select_affy_probe_id}->finish; + return ($probe_id); +} + +sub fixup_row { + my ($ih,$r) = @_; + my %r; # return + for my $h (keys %{$ih}) { + $r{$h} = $r->[$ih->{$h}]; + } + return %r; +} + + +__END__ diff --git a/utils/load_gcrma_data.pl b/utils/load_gcrma_data.pl new file mode 100755 index 0000000..4b42e2e --- /dev/null +++ b/utils/load_gcrma_data.pl @@ -0,0 +1,120 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +use DBI; + + +my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 1}) or + die "Unable to connect to database: ".$DBI::errstr; + +my %sth; +$sth{insert_sample} = $dbh->prepare(<<'END') // die "Unable to prepare insert sample statement: ".$dbh->errstr; +INSERT INTO gcrma_samples (tissue) VALUES (?); +END + +$sth{select_sample} = $dbh->prepare(<<'END') // die "Unable to prepare select sample statement: ".$dbh->errstr; +SELECT * FROM gcrma_samples WHERE tissue = ?; +END + +$sth{insert_probe} = $dbh->prepare(<<'END') // die "Unable to prepare insert probe statement: ".$dbh->errstr; +INSERT INTO affy_probe (probe) VALUES (?); +END + +$sth{select_probe} = $dbh->prepare(<<'END') // die "Unable to prepare select probe statement: ".$dbh->errstr; +SELECT * FROM affy_probe WHERE probe = ?; +END + +$sth{insert_reading} = $dbh->prepare(<<'END') // die "Unable to prepare insert reading statement: ".$dbh->errstr; +INSERT INTO gcrma_expression (probe,sample,expression) VALUES (?,?,?); +END + + + +my @samples; + +use Term::ProgressBar; +use Fcntl qw(:seek); + +my @ifh; +for my $ifn (@ARGV) { + my $ifh = IO::File->new($ifn,'r') or + die "Unable to open $ifn for reading: $!"; + push @ifh,$ifh; +} + +# read avg.csv file +for my $ifh (@ifh) { + my $p; + if ($ifh->seek(0,SEEK_END)) { + $p = Term::ProgressBar->new({count => $ifh->tell, + remove => 1, + ETA=>'linear'}); + $ifh->seek(0,SEEK_SET); + } + while (<$ifh>) { + chomp; + my @line = split /\s*,\s*/; + if (not @samples) { + shift @line; + for my $sample (@line) { + push @samples, insert_sample($dbh,\%sth,$sample); + } + next; + } + my $probe = insert_probe($dbh,\%sth,shift @line); + $dbh->do('COPY gcrma_expression (probe,sample,expression) FROM STDIN;'); + for (0..$#line) { + $dbh->pg_putcopydata("$probe\t$samples[$_]\t$line[$_]\n"); + } + $dbh->pg_putcopyend(); + if (defined $p) { + $p->update($ifh->tell); + } + } +} + +sub insert_sample{ + my ($dbh,$sth,$sample) = @_; + my $rv = $sth->{insert_sample}->execute($sample) // + die "Unable to execute statement properly: ".$dbh->errstr; + $sth->{insert_sample}->finish; + return select_sample(@_); +} + +sub select_sample { + my ($dbh,$sth,$sample) = @_; + my $rv = $sth->{select_sample}->execute($sample) // + die "Unable to execute statement properly: ".$dbh->errstr; + my ($sample_id) = map {ref $_ ?@{$_}:()} + map {ref $_ ?@{$_}:()} $sth->{select_sample}->fetchall_arrayref([0]); + $sth->{select_sample}->finish; + return $sample_id; +} + +sub insert_probe{ + my ($dbh,$sth,$probe) = @_; + my $rv = $sth->{insert_probe}->execute($probe) // + die "Unable to execute statement properly: ".$dbh->errstr; + $sth->{insert_probe}->finish; + return select_probe(@_); +} + +sub select_probe { + my ($dbh,$sth,$probe) = @_; + my $rv = $sth->{select_probe}->execute($probe) // + die "Unable to execute statement properly: ".$dbh->errstr; + my ($probe_id) = map {ref $_ ?@{$_}:()} + map {ref $_ ?@{$_}:()} $sth->{select_probe}->fetchall_arrayref([0]); + $sth->{select_probe}->finish; + return $probe_id; +} + +sub insert_reading{ + my ($dbh,$sth,$probe,$sample,$reading) = @_; + my $rv = $sth->{insert_reading}->execute($probe,$sample,$reading) // + die "Unable to execute statement properly: ".$dbh->errstr; + $sth->{insert_reading}->finish; +} + -- 2.39.2