]> git.donarmstrong.com Git - dbsnp.git/commitdiff
add routines to load encode data master
authorDon Armstrong <don@donarmstrong.com>
Thu, 29 Aug 2013 21:14:25 +0000 (14:14 -0700)
committerDon Armstrong <don@donarmstrong.com>
Thu, 29 Aug 2013 21:14:25 +0000 (14:14 -0700)
schema/extra_schema/encode_schema.sql [new file with mode: 0644]
utils/load_encode_data.pl [new file with mode: 0755]

diff --git a/schema/extra_schema/encode_schema.sql b/schema/extra_schema/encode_schema.sql
new file mode 100644 (file)
index 0000000..f636e24
--- /dev/null
@@ -0,0 +1,39 @@
+DROP TABLE encdoe_track_info CASCADE;
+DROP TABLE encode_tf_binding_tracks CASCADE;
+DROP TABLE encode_tracks CASCADE;
+DROP TABLE encode_track_info_keys CASCADE;
+
+CREATE TABLE encode_tracks (
+       id SERIAL PRIMARY KEY,
+       track_name TEXT UNIQUE
+);
+
+CREATE TABLE encode_track_info_keys (
+       id SERIAL PRIMARY KEY,
+       info_key TEXT UNIQUE
+);
+
+CREATE TABLE encode_track_info (
+       id SERIAL PRIMARY KEY,
+       track_id INT NOT NULL REFERENCES encode_tracks,
+       info_key_id INT NOT NULL REFERENCES encode_track_info_keys,
+       info_value TEXT NOT NULL
+);
+
+CREATE UNIQUE INDEX ON encode_track_info(track_id,info_key_id);
+CREATE INDEX ON encode_track_info(info_key_id,info_value);
+
+CREATE TABLE encode_tf_binding_tracks (
+       id SERIAL PRIMARY KEY,
+       track_id INT NOT NULL REFERENCES encode_tracks,
+       chr TEXT NOT NULL,
+       start INT,
+       stop INT,
+       name TEXT NOT NULL DEFAULT '',
+       score INT NOT NULL,
+       strand TEXT NOT NULL DEFAULT '.',
+       signalValue FLOAT,
+       pValue FLOAT,
+       qValue FLOAT,
+       peak INT
+);
diff --git a/utils/load_encode_data.pl b/utils/load_encode_data.pl
new file mode 100755 (executable)
index 0000000..915e4f9
--- /dev/null
@@ -0,0 +1,254 @@
+#!/usr/bin/perl
+# load_encode_data.pl loads tracks from encode into an snp database
+# and is released under the terms of the GNU GPL version 3, or any
+# later version, at your option. See the file README and COPYING for
+# more information.
+# Copyright 2013 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+load_encode_data.pl - loads tracks from encode into an snp database
+
+=head1 SYNOPSIS
+
+load_encode_data.pl [options] [filelist.txt]
+
+
+ Options:
+  --service, -s pgsql service
+  --progress, -p show progress bar
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--service,-s>
+
+Postgresql service
+
+=item B<--progress,-p>
+
+Show a progress bar
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+load_encode_data.pl
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Term::ProgressBar;
+use Fcntl qw(:seek);
+use Text::CSV;
+use IO::Uncompress::Gunzip;
+
+my %options = (debug           => 0,
+               help            => 0,
+               man             => 0,
+               service         => 'snp',
+               progress        => 1,
+              );
+
+GetOptions(\%options,
+           'service|s=s',
+           'progress|p!',
+           'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+my $dbh = DBI->connect("dbi:Pg:service=$options{service}",
+                       '','',{AutoCommit => 0}) or
+    die "Unable to connect to database: ".$DBI::errstr;
+
+my %sth;
+$sth{insert_track} = $dbh->prepare(<<'END') // die "Unable to prepare insert track statement: ".$dbh->errstr;
+INSERT INTO encode_tracks
+(track_name) VALUES ($1);
+END
+
+$sth{insert_track_info_keys} = $dbh->prepare(<<'END') // die "Unable to prepare insert track info keys statement: ".$dbh->errstr;
+INSERT INTO encode_track_info_keys
+(info_key) VALUES ($1);
+END
+
+$sth{insert_track_info} = $dbh->prepare(<<'END') // die "Unable to prepare insert track info statement: ".$dbh->errstr;
+INSERT INTO encode_track_info
+(track_id,info_key_id,info_value) VALUES ($1,$2,$3);
+END
+
+$sth{delete_track_info} = $dbh->prepare(<<'END') // die "Unable to prepare delete track info statement: ".$dbh->errstr;
+DELETE FROM encode_track_info WHERE track_id=$1;
+END
+
+
+$sth{insert_tf_binding_track} = $dbh->prepare(<<'END') // die "Unable to prepare insert tf_binding_track statement: ".$dbh->errstr;
+INSERT INTO encode_tf_binding_tracks
+(track_id,chr,start,stop,name,score,strand,signalValue,pValue,qValue,peak)
+VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11);
+END
+
+$sth{select_track_id} = $dbh->prepare(<<'END') // die "Unable to prepare select track id statement: ".$dbh->errstr;
+SELECT track.id FROM encode_tracks track WHERE track.track_name = $1;
+END
+
+$sth{select_info_key_id} = $dbh->prepare(<<'END') // die "Unable to prepare select info key id statement: ".$dbh->errstr;
+SELECT info_key.id FROM encode_track_info_keys info_key WHERE info_key.info_key = $1;
+END
+
+my @ifh;
+for my $ifn (@ARGV) {
+    my $ifh = IO::File->new($ifn,'r') or
+        die "Unable to open $ifn for reading: $!";
+    push @ifh,$ifh;
+}
+
+if (not @ARGV) {
+    push @ifh,\*STDIN;
+}
+
+my $p = Term::ProgressBar->new({count   => 1,
+                                remove  => 1,
+                                ETA     => 'linear',});
+my %encode_data_files;
+for my $ifh (@ifh) {
+    while (<$ifh>) {
+        chomp;
+        my ($file,$info) = split /\t/,$_,2;
+
+        $encode_data_files{$file} =
+            {filename => $file,
+             map {my @t = split /=/,$_,2; @t} split /\s*;\s*/,$info,
+            };
+    }
+    close ($ifh);
+}
+
+my $total_files = scalar keys %encode_data_files;
+my $total_size = 0;
+my $average_size = 0;
+my $handled_files = 0;
+
+my %info_keys;
+for my $file (keys %encode_data_files) {
+    $handled_files++;
+    if (! -e $encode_data_files{$file}{filename}) {
+        print STDERR "Skipping $encode_data_files{$file}{filename}, no gzip\n";
+        next;
+    }
+    my $fh = IO::Uncompress::Gunzip->new($encode_data_files{$file}{filename},{MultiStream => 1}) or
+        die "Unable to open '$encode_data_files{$file}{filename}' for reading: $!";
+    my $p_fh = $fh;
+    if ($fh->can('getHeaderInfo')) {
+        # oh boy is this horrible.
+        $p_fh = *$fh->{FH};
+    }
+    my $cur_pos = $p_fh->tell;
+    $p_fh->seek(0,SEEK_END);
+    my $file_size = $p_fh->tell;
+    $p_fh->seek($cur_pos,SEEK_SET);
+    $average_size = ($average_size*($handled_files-1)+$file_size)/$handled_files;
+    $total_size=$total_size+$file_size;
+    $p->target($total_size+$average_size*($total_files-$handled_files));
+    my $track_id = insert_track($dbh,\%sth,\%info_keys,$encode_data_files{$file});
+    $dbh->do("COPY encode_tf_binding_tracks (track_id,chr,start,stop,name,score,strand,signalValue,pValue,qValue,peak) FROM STDIN");
+    # insert data for this track
+    while (<$fh>) {
+        chomp;
+        my @row = split /\t/,$_;
+        $p->update($total_size-$file_size+$p_fh->tell);
+        $row[0] =~ s/^chr//;
+        $dbh->pg_putcopydata(join("\t",$track_id,@row)."\n");
+    }
+    $dbh->pg_putcopyend();
+    $p->update($total_size);
+
+
+}
+
+sub select_one {
+    my ($dbh,$sth,$name,$bind,$pos) = @_;
+    $pos //= 0;
+
+    my $rv = $sth->{$name}->execute(@{$bind}) or
+        die "Unable to execute statement '$name'".$dbh->errstr();
+    my ($item) = map {ref $_ ?@{$_}:()}
+        map {ref $_ ?@{$_}:()} $sth->{$name}->fetchall_arrayref([$pos]);
+    $sth->{$name}->finish();
+    return $item;
+}
+
+sub insert_track {
+    my ($dbh,$sth,$info_keys,$encode_data_files) = @_;
+
+    my $track_name = $encode_data_files->{tableName}//$encode_data_files->{filename};
+
+    # insert the track file
+    my $track_id;
+    $track_id = select_one($dbh,$sth,'select_track_id',
+                           [$track_name]);
+    if (defined $track_id) {
+        $sth->{delete_track_info}->execute($track_id) or
+            die "Unable to delete track information".$dbh->errstr();
+    } else {
+        $sth->{insert_track}->execute($track_name) or
+            die "Unable to insert track".$dbh->errstr();
+        $track_id = $dbh->last_insert_id((undef) x 2,'encode_tracks',undef);
+        if (not defined $track_id) {
+            die "Unable to get a valid track id after inserting a track";
+        }
+    }
+    # insert track info
+    for my $key (keys %{$encode_data_files}) {
+        my $info_key_id = $info_keys->{$key};
+        if (not defined $info_key_id) {
+            $info_key_id = select_one($dbh,$sth,'select_info_key_id',[$key]);
+            if (not defined $info_key_id) {
+                $sth->{insert_track_info_keys}->execute($key) or
+                    die "Unable to insert track info".$dbh->errstr();
+                $info_key_id = $dbh->last_insert_id((undef) x 2,'encode_track_info_keys',undef);
+                if (not defined $info_key_id) {
+                    die "Unable to get a valid track info key after inserting one";
+                }
+            }
+            $info_keys->{$key} = $info_key_id;
+        }
+        $sth->{insert_track_info}->execute($track_id,$info_key_id,$encode_data_files->{$key});
+    }
+    return $track_id;
+}
+
+
+__END__