]> git.donarmstrong.com Git - dbsnp.git/blobdiff - utils/load_affymetrix_probe_annotations.pl
add gcrma data and affymetrix probe annotation loaders
[dbsnp.git] / utils / load_affymetrix_probe_annotations.pl
diff --git a/utils/load_affymetrix_probe_annotations.pl b/utils/load_affymetrix_probe_annotations.pl
new file mode 100755 (executable)
index 0000000..4ef8ccf
--- /dev/null
@@ -0,0 +1,248 @@
+#!/usr/bin/perl
+# load_affymetrix_probe_annotations.pl loads affymetrix probe annotations
+# and is released under the terms of the GNU GPL version 3, or any
+# later version, at your option. See the file README and COPYING for
+# more information.
+# Copyright 2013 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+load_affymetrix_probe_annotations.pl - loads affymetrix probe annotations
+
+=head1 SYNOPSIS
+
+load_affymetrix_probe_annotations.pl [options] [annotation files]
+
+ Options:
+  --service, -s pgsql service
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--service,-s>
+
+Postgresql service
+
+=item B<--progress,-p>
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+load_affymetrix_probe_annotations.pl
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Term::ProgressBar;
+use Fcntl qw(:seek);
+use Text::CSV;
+
+
+my %options = (debug           => 0,
+               help            => 0,
+               man             => 0,
+               service         => 'snp',
+               progress        => 1,
+              );
+
+GetOptions(\%options,
+           'service|s=s',
+           'progress|p',
+           'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+# if (1) {
+#      push @USAGE_ERRORS,"You must pass something";
+# }
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+my $dbh = DBI->connect("dbi:Pg:service=$options{service}",
+                       '','',{AutoCommit => 0}) or
+    die "Unable to connect to database: ".$DBI::errstr;
+
+my %sth;
+$sth{insert_annotation} = $dbh->prepare(<<'END') // die "Unable to prepare insert annotation statement: ".$dbh->errstr;
+INSERT INTO affy_annotation
+(probe,gene_symbol,
+gene_name,species,
+array_name,entrez_id,
+refseq_prot,refseq_transcript) VALUES ($1,$2,$3,$4,$5,$6,$7,$8);
+END
+
+$sth{delete_annotation_id} = $dbh->prepare(<<'END') // die "Unable to prepare delete annotation id statement: ".$dbh->errstr;
+DELETE FROM affy_annotation aa WHERE aa.probe = $1;
+END
+
+$sth{select_annotation_id} = $dbh->prepare(<<'END') // die "Unable to prepare select annotation id statement: ".$dbh->errstr;
+SELECT aa.id FROM affy_annotation aa WHERE aa.probe = $1;
+END
+
+
+$sth{select_affy_probe_id} = $dbh->prepare(<<'END') // die "Unable to prepare select annotation id statement: ".$dbh->errstr;
+SELECT id FROM affy_probe ap WHERE ap.probe = $1;
+END
+
+$sth{insert_affy_probe_id} = $dbh->prepare(<<'END') // die "Unable to prepare insert affy probe id statement: ".$dbh->errstr;
+INSERT INTO affy_probe (probe) VALUES ($1);
+END
+
+
+
+my @ifh;
+for my $ifn (@ARGV) {
+    my $ifh = IO::File->new($ifn,'r') or
+       die "Unable to open $ifn for reading: $!";
+    push @ifh,$ifh;
+}
+
+if (not @ARGV) {
+    push @ifh,\*STDIN;
+}
+
+my %header_regex =
+    (probe => qr/(?i)Probe\s*Set\s*ID/,
+     gene_symbol => qr/(?i)Gene\s*Symbol/,
+     gene_name => qr/(?i)Gene\s*Title/,
+     species => qr/(?i)Species\s*Scientific\s*Name/,
+     array_name => qr/(?i)GeneChip\s*Array/,
+     entrez_id => qr/(?i)Entrez\s*Gene/,
+     refseq_prot => qr/(?i)RefSeq\s*Protein\s*ID/,
+     refseq_transcript => qr/(?i)RefSeq\s*Transcript\s*ID/,
+    );
+
+for my $ifh (@ifh) {
+    my $p;
+    if ($options{progress}) {
+        if ($ifh->seek(0,SEEK_END)) {
+            $p = Term::ProgressBar->new({count => $ifh->tell,
+                                         remove => 1,
+                                         ETA=>'linear'});
+            $ifh->seek(0,SEEK_SET);
+        }
+    }
+    my @header;
+    my $csv = Text::CSV->new({sep_char=>','});
+    my %headers;
+    my %important_headers;
+    while (<$ifh>) {
+        chomp;
+        next if /^#/;
+        if (not $csv->parse($_)) {
+            die "Unable to parse line $. of file: ".$csv->error_diag();
+        }
+        my @row = $csv->fields();
+        if (not @header) {
+            @header = @row;
+            @headers{@header} = 0..$#row;
+            for my $header (keys %header_regex) {
+                my @match =
+                    grep { $_ =~ $header_regex{$header}
+                       } keys %headers;
+                $important_headers{$header} = $headers{$match[0]} if @match;
+                if (not @match) {
+                    use Data::Printer;
+                    p %important_headers;
+                    p @header;
+                    p %headers;
+                    die "unable to find header match for $header";
+                }
+            }
+            next;
+        }
+        insert_annotation($dbh,\%sth,
+                          {fixup_row(\%important_headers,\@row)
+                          },
+                         );
+        if (defined $p) {
+            $p->update($ifh->tell);
+        }
+    }
+    $dbh->commit();
+}
+
+sub insert_annotation {
+    my ($dbh,$sth,$annot) = @_;
+    # find the probe id
+    $annot->{probe_id} = select_affy_probe_id(@_);
+    # see if this annotation already exists
+    return unless defined $annot->{probe_id};
+    my $annot_id = select_annotation_id(@_);
+    # if not, insert it
+    if (not defined $annot_id) {
+        my $rv = $sth->{insert_annotation}->execute(@{$annot}{(qw(probe_id gene_symbol gene_name species array_name entrez_id),
+                                                               qw(refseq_prot refseq_transcript),
+                                                              )})
+            // die "Unable to execute statement properly: ".$dbh->errstr;
+        $sth->{insert_annotation}->finish;
+    } else {
+        print STDERR "probe: $annot->{probe} is already annotated ($annot_id)\n" if $DEBUG;
+    }
+}
+
+sub select_annotation_id {
+    my ($dbh,$sth,$annot) = @_;
+    if (not defined $annot->{probe_id}) {
+        $annot->{probe_id} = select_affy_probe_id(@_);
+        return unless defined $annot->{probe_id};
+    }
+    my $rv = $sth->{select_annotation_id}->execute($annot->{probe_id}) //
+        die "Unable to execute statement properly: ".$dbh->errstr;
+    my ($sample_id) = map {ref $_ ?@{$_}:()}
+        map {ref $_ ?@{$_}:()} $sth->{select_annotation_id}->fetchall_arrayref([0]);
+    $sth->{select_annotation_id}->finish;
+    return ($sample_id);
+}
+
+sub select_affy_probe_id {
+    my ($dbh,$sth,$annot) = @_;
+    my $rv = $sth->{select_affy_probe_id}->execute($annot->{probe}) //
+        die "Unable to execute statement properly: ".$dbh->errstr;
+    my ($probe_id) = map {ref $_ ?@{$_}:()}
+        map {ref $_ ?@{$_}:()} $sth->{select_affy_probe_id}->fetchall_arrayref([0]);
+    $sth->{select_affy_probe_id}->finish;
+    return ($probe_id);
+}
+
+sub fixup_row {
+    my ($ih,$r) = @_;
+    my %r; # return
+    for my $h (keys %{$ih}) {
+        $r{$h} = $r->[$ih->{$h}];
+    }
+    return %r;
+}
+
+
+__END__