--- /dev/null
+DROP TABLE affy_annotation;
+DROP TABLE gcrma_expression;
+DROP TABLE affy_probe;
+DROP TABLE gcrma_samples;
+
+CREATE TABLE gcrma_samples (
+ id SERIAL NOT NULL, -- PRIMARY KEY,
+ tissue TEXT NOT NULL
+);
+
+CREATE TABLE affy_probe (
+ id SERIAL NOT NULL, -- PRIMARY KEY,
+ probe TEXT NOT NULL
+);
+
+CREATE TABLE affy_annotation (
+ id SERIAL NOT NULL, -- PRIMARY KEY,
+ probe INT NOT NULL,-- REFERENCES affy_probe,
+ gene_symbol TEXT,
+ gene_name TEXT,
+ species TEXT,
+ array_name TEXT,
+ entrez_id TEXT,
+ refseq_prot TEXT,
+ refseq_transcript TEXT
+);
+
+CREATE TABLE gcrma_expression (
+ id SERIAL NOT NULL, -- PRIMARY KEY,
+ probe INT NOT NULL,-- REFERENCES affy_probe,
+ sample INT NOT NULL,-- REFERENCES gcrma_samples,
+ expression FLOAT NOT NULL
+);
+
+CREATE VIEW gcrma AS
+ SELECT ge.id AS id,
+ gs.tissue AS tissue,
+ ap.probe AS probe,
+ ge.expression AS expression,
+ aa.gene_symbol AS symbol
+ FROM gcrma_expression ge
+ JOIN affy_probe ap ON ge.probe=ap.id
+ JOIN gcrma_samples gs ON ge.sample=gs.id
+ LEFT OUTER JOIN affy_annotation aa ON ap.id=aa.probe;
+
+
--- /dev/null
+#!/usr/bin/perl
+# load_affymetrix_probe_annotations.pl loads affymetrix probe annotations
+# and is released under the terms of the GNU GPL version 3, or any
+# later version, at your option. See the file README and COPYING for
+# more information.
+# Copyright 2013 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+load_affymetrix_probe_annotations.pl - loads affymetrix probe annotations
+
+=head1 SYNOPSIS
+
+load_affymetrix_probe_annotations.pl [options] [annotation files]
+
+ Options:
+ --service, -s pgsql service
+ --debug, -d debugging level (Default 0)
+ --help, -h display this help
+ --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--service,-s>
+
+Postgresql service
+
+=item B<--progress,-p>
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+load_affymetrix_probe_annotations.pl
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Term::ProgressBar;
+use Fcntl qw(:seek);
+use Text::CSV;
+
+
+my %options = (debug => 0,
+ help => 0,
+ man => 0,
+ service => 'snp',
+ progress => 1,
+ );
+
+GetOptions(\%options,
+ 'service|s=s',
+ 'progress|p',
+ 'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+# if (1) {
+# push @USAGE_ERRORS,"You must pass something";
+# }
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+my $dbh = DBI->connect("dbi:Pg:service=$options{service}",
+ '','',{AutoCommit => 0}) or
+ die "Unable to connect to database: ".$DBI::errstr;
+
+my %sth;
+$sth{insert_annotation} = $dbh->prepare(<<'END') // die "Unable to prepare insert annotation statement: ".$dbh->errstr;
+INSERT INTO affy_annotation
+(probe,gene_symbol,
+gene_name,species,
+array_name,entrez_id,
+refseq_prot,refseq_transcript) VALUES ($1,$2,$3,$4,$5,$6,$7,$8);
+END
+
+$sth{delete_annotation_id} = $dbh->prepare(<<'END') // die "Unable to prepare delete annotation id statement: ".$dbh->errstr;
+DELETE FROM affy_annotation aa WHERE aa.probe = $1;
+END
+
+$sth{select_annotation_id} = $dbh->prepare(<<'END') // die "Unable to prepare select annotation id statement: ".$dbh->errstr;
+SELECT aa.id FROM affy_annotation aa WHERE aa.probe = $1;
+END
+
+
+$sth{select_affy_probe_id} = $dbh->prepare(<<'END') // die "Unable to prepare select annotation id statement: ".$dbh->errstr;
+SELECT id FROM affy_probe ap WHERE ap.probe = $1;
+END
+
+$sth{insert_affy_probe_id} = $dbh->prepare(<<'END') // die "Unable to prepare insert affy probe id statement: ".$dbh->errstr;
+INSERT INTO affy_probe (probe) VALUES ($1);
+END
+
+
+
+my @ifh;
+for my $ifn (@ARGV) {
+ my $ifh = IO::File->new($ifn,'r') or
+ die "Unable to open $ifn for reading: $!";
+ push @ifh,$ifh;
+}
+
+if (not @ARGV) {
+ push @ifh,\*STDIN;
+}
+
+my %header_regex =
+ (probe => qr/(?i)Probe\s*Set\s*ID/,
+ gene_symbol => qr/(?i)Gene\s*Symbol/,
+ gene_name => qr/(?i)Gene\s*Title/,
+ species => qr/(?i)Species\s*Scientific\s*Name/,
+ array_name => qr/(?i)GeneChip\s*Array/,
+ entrez_id => qr/(?i)Entrez\s*Gene/,
+ refseq_prot => qr/(?i)RefSeq\s*Protein\s*ID/,
+ refseq_transcript => qr/(?i)RefSeq\s*Transcript\s*ID/,
+ );
+
+for my $ifh (@ifh) {
+ my $p;
+ if ($options{progress}) {
+ if ($ifh->seek(0,SEEK_END)) {
+ $p = Term::ProgressBar->new({count => $ifh->tell,
+ remove => 1,
+ ETA=>'linear'});
+ $ifh->seek(0,SEEK_SET);
+ }
+ }
+ my @header;
+ my $csv = Text::CSV->new({sep_char=>','});
+ my %headers;
+ my %important_headers;
+ while (<$ifh>) {
+ chomp;
+ next if /^#/;
+ if (not $csv->parse($_)) {
+ die "Unable to parse line $. of file: ".$csv->error_diag();
+ }
+ my @row = $csv->fields();
+ if (not @header) {
+ @header = @row;
+ @headers{@header} = 0..$#row;
+ for my $header (keys %header_regex) {
+ my @match =
+ grep { $_ =~ $header_regex{$header}
+ } keys %headers;
+ $important_headers{$header} = $headers{$match[0]} if @match;
+ if (not @match) {
+ use Data::Printer;
+ p %important_headers;
+ p @header;
+ p %headers;
+ die "unable to find header match for $header";
+ }
+ }
+ next;
+ }
+ insert_annotation($dbh,\%sth,
+ {fixup_row(\%important_headers,\@row)
+ },
+ );
+ if (defined $p) {
+ $p->update($ifh->tell);
+ }
+ }
+ $dbh->commit();
+}
+
+sub insert_annotation {
+ my ($dbh,$sth,$annot) = @_;
+ # find the probe id
+ $annot->{probe_id} = select_affy_probe_id(@_);
+ # see if this annotation already exists
+ return unless defined $annot->{probe_id};
+ my $annot_id = select_annotation_id(@_);
+ # if not, insert it
+ if (not defined $annot_id) {
+ my $rv = $sth->{insert_annotation}->execute(@{$annot}{(qw(probe_id gene_symbol gene_name species array_name entrez_id),
+ qw(refseq_prot refseq_transcript),
+ )})
+ // die "Unable to execute statement properly: ".$dbh->errstr;
+ $sth->{insert_annotation}->finish;
+ } else {
+ print STDERR "probe: $annot->{probe} is already annotated ($annot_id)\n" if $DEBUG;
+ }
+}
+
+sub select_annotation_id {
+ my ($dbh,$sth,$annot) = @_;
+ if (not defined $annot->{probe_id}) {
+ $annot->{probe_id} = select_affy_probe_id(@_);
+ return unless defined $annot->{probe_id};
+ }
+ my $rv = $sth->{select_annotation_id}->execute($annot->{probe_id}) //
+ die "Unable to execute statement properly: ".$dbh->errstr;
+ my ($sample_id) = map {ref $_ ?@{$_}:()}
+ map {ref $_ ?@{$_}:()} $sth->{select_annotation_id}->fetchall_arrayref([0]);
+ $sth->{select_annotation_id}->finish;
+ return ($sample_id);
+}
+
+sub select_affy_probe_id {
+ my ($dbh,$sth,$annot) = @_;
+ my $rv = $sth->{select_affy_probe_id}->execute($annot->{probe}) //
+ die "Unable to execute statement properly: ".$dbh->errstr;
+ my ($probe_id) = map {ref $_ ?@{$_}:()}
+ map {ref $_ ?@{$_}:()} $sth->{select_affy_probe_id}->fetchall_arrayref([0]);
+ $sth->{select_affy_probe_id}->finish;
+ return ($probe_id);
+}
+
+sub fixup_row {
+ my ($ih,$r) = @_;
+ my %r; # return
+ for my $h (keys %{$ih}) {
+ $r{$h} = $r->[$ih->{$h}];
+ }
+ return %r;
+}
+
+
+__END__
--- /dev/null
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use DBI;
+
+
+my $dbh = DBI->connect("dbi:Pg:service=snp",'','',{AutoCommit => 1}) or
+ die "Unable to connect to database: ".$DBI::errstr;
+
+my %sth;
+$sth{insert_sample} = $dbh->prepare(<<'END') // die "Unable to prepare insert sample statement: ".$dbh->errstr;
+INSERT INTO gcrma_samples (tissue) VALUES (?);
+END
+
+$sth{select_sample} = $dbh->prepare(<<'END') // die "Unable to prepare select sample statement: ".$dbh->errstr;
+SELECT * FROM gcrma_samples WHERE tissue = ?;
+END
+
+$sth{insert_probe} = $dbh->prepare(<<'END') // die "Unable to prepare insert probe statement: ".$dbh->errstr;
+INSERT INTO affy_probe (probe) VALUES (?);
+END
+
+$sth{select_probe} = $dbh->prepare(<<'END') // die "Unable to prepare select probe statement: ".$dbh->errstr;
+SELECT * FROM affy_probe WHERE probe = ?;
+END
+
+$sth{insert_reading} = $dbh->prepare(<<'END') // die "Unable to prepare insert reading statement: ".$dbh->errstr;
+INSERT INTO gcrma_expression (probe,sample,expression) VALUES (?,?,?);
+END
+
+
+
+my @samples;
+
+use Term::ProgressBar;
+use Fcntl qw(:seek);
+
+my @ifh;
+for my $ifn (@ARGV) {
+ my $ifh = IO::File->new($ifn,'r') or
+ die "Unable to open $ifn for reading: $!";
+ push @ifh,$ifh;
+}
+
+# read avg.csv file
+for my $ifh (@ifh) {
+ my $p;
+ if ($ifh->seek(0,SEEK_END)) {
+ $p = Term::ProgressBar->new({count => $ifh->tell,
+ remove => 1,
+ ETA=>'linear'});
+ $ifh->seek(0,SEEK_SET);
+ }
+ while (<$ifh>) {
+ chomp;
+ my @line = split /\s*,\s*/;
+ if (not @samples) {
+ shift @line;
+ for my $sample (@line) {
+ push @samples, insert_sample($dbh,\%sth,$sample);
+ }
+ next;
+ }
+ my $probe = insert_probe($dbh,\%sth,shift @line);
+ $dbh->do('COPY gcrma_expression (probe,sample,expression) FROM STDIN;');
+ for (0..$#line) {
+ $dbh->pg_putcopydata("$probe\t$samples[$_]\t$line[$_]\n");
+ }
+ $dbh->pg_putcopyend();
+ if (defined $p) {
+ $p->update($ifh->tell);
+ }
+ }
+}
+
+sub insert_sample{
+ my ($dbh,$sth,$sample) = @_;
+ my $rv = $sth->{insert_sample}->execute($sample) //
+ die "Unable to execute statement properly: ".$dbh->errstr;
+ $sth->{insert_sample}->finish;
+ return select_sample(@_);
+}
+
+sub select_sample {
+ my ($dbh,$sth,$sample) = @_;
+ my $rv = $sth->{select_sample}->execute($sample) //
+ die "Unable to execute statement properly: ".$dbh->errstr;
+ my ($sample_id) = map {ref $_ ?@{$_}:()}
+ map {ref $_ ?@{$_}:()} $sth->{select_sample}->fetchall_arrayref([0]);
+ $sth->{select_sample}->finish;
+ return $sample_id;
+}
+
+sub insert_probe{
+ my ($dbh,$sth,$probe) = @_;
+ my $rv = $sth->{insert_probe}->execute($probe) //
+ die "Unable to execute statement properly: ".$dbh->errstr;
+ $sth->{insert_probe}->finish;
+ return select_probe(@_);
+}
+
+sub select_probe {
+ my ($dbh,$sth,$probe) = @_;
+ my $rv = $sth->{select_probe}->execute($probe) //
+ die "Unable to execute statement properly: ".$dbh->errstr;
+ my ($probe_id) = map {ref $_ ?@{$_}:()}
+ map {ref $_ ?@{$_}:()} $sth->{select_probe}->fetchall_arrayref([0]);
+ $sth->{select_probe}->finish;
+ return $probe_id;
+}
+
+sub insert_reading{
+ my ($dbh,$sth,$probe,$sample,$reading) = @_;
+ my $rv = $sth->{insert_reading}->execute($probe,$sample,$reading) //
+ die "Unable to execute statement properly: ".$dbh->errstr;
+ $sth->{insert_reading}->finish;
+}
+