From 49dd0f54d3777f94f2b014dba41702acff928e49 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Thu, 29 Aug 2013 14:14:25 -0700 Subject: [PATCH] add routines to load encode data --- schema/extra_schema/encode_schema.sql | 39 ++++ utils/load_encode_data.pl | 254 ++++++++++++++++++++++++++ 2 files changed, 293 insertions(+) create mode 100644 schema/extra_schema/encode_schema.sql create mode 100755 utils/load_encode_data.pl diff --git a/schema/extra_schema/encode_schema.sql b/schema/extra_schema/encode_schema.sql new file mode 100644 index 0000000..f636e24 --- /dev/null +++ b/schema/extra_schema/encode_schema.sql @@ -0,0 +1,39 @@ +DROP TABLE encdoe_track_info CASCADE; +DROP TABLE encode_tf_binding_tracks CASCADE; +DROP TABLE encode_tracks CASCADE; +DROP TABLE encode_track_info_keys CASCADE; + +CREATE TABLE encode_tracks ( + id SERIAL PRIMARY KEY, + track_name TEXT UNIQUE +); + +CREATE TABLE encode_track_info_keys ( + id SERIAL PRIMARY KEY, + info_key TEXT UNIQUE +); + +CREATE TABLE encode_track_info ( + id SERIAL PRIMARY KEY, + track_id INT NOT NULL REFERENCES encode_tracks, + info_key_id INT NOT NULL REFERENCES encode_track_info_keys, + info_value TEXT NOT NULL +); + +CREATE UNIQUE INDEX ON encode_track_info(track_id,info_key_id); +CREATE INDEX ON encode_track_info(info_key_id,info_value); + +CREATE TABLE encode_tf_binding_tracks ( + id SERIAL PRIMARY KEY, + track_id INT NOT NULL REFERENCES encode_tracks, + chr TEXT NOT NULL, + start INT, + stop INT, + name TEXT NOT NULL DEFAULT '', + score INT NOT NULL, + strand TEXT NOT NULL DEFAULT '.', + signalValue FLOAT, + pValue FLOAT, + qValue FLOAT, + peak INT +); diff --git a/utils/load_encode_data.pl b/utils/load_encode_data.pl new file mode 100755 index 0000000..915e4f9 --- /dev/null +++ b/utils/load_encode_data.pl @@ -0,0 +1,254 @@ +#!/usr/bin/perl +# load_encode_data.pl loads tracks from encode into an snp database +# and is released under the terms of the GNU GPL version 3, or any +# later version, at your option. See the file README and COPYING for +# more information. +# Copyright 2013 by Don Armstrong . + + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +load_encode_data.pl - loads tracks from encode into an snp database + +=head1 SYNOPSIS + +load_encode_data.pl [options] [filelist.txt] + + + Options: + --service, -s pgsql service + --progress, -p show progress bar + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--service,-s> + +Postgresql service + +=item B<--progress,-p> + +Show a progress bar + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + +load_encode_data.pl + +=cut + + +use vars qw($DEBUG); +use DBI; +use Term::ProgressBar; +use Fcntl qw(:seek); +use Text::CSV; +use IO::Uncompress::Gunzip; + +my %options = (debug => 0, + help => 0, + man => 0, + service => 'snp', + progress => 1, + ); + +GetOptions(\%options, + 'service|s=s', + 'progress|p!', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; + +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + +my $dbh = DBI->connect("dbi:Pg:service=$options{service}", + '','',{AutoCommit => 0}) or + die "Unable to connect to database: ".$DBI::errstr; + +my %sth; +$sth{insert_track} = $dbh->prepare(<<'END') // die "Unable to prepare insert track statement: ".$dbh->errstr; +INSERT INTO encode_tracks +(track_name) VALUES ($1); +END + +$sth{insert_track_info_keys} = $dbh->prepare(<<'END') // die "Unable to prepare insert track info keys statement: ".$dbh->errstr; +INSERT INTO encode_track_info_keys +(info_key) VALUES ($1); +END + +$sth{insert_track_info} = $dbh->prepare(<<'END') // die "Unable to prepare insert track info statement: ".$dbh->errstr; +INSERT INTO encode_track_info +(track_id,info_key_id,info_value) VALUES ($1,$2,$3); +END + +$sth{delete_track_info} = $dbh->prepare(<<'END') // die "Unable to prepare delete track info statement: ".$dbh->errstr; +DELETE FROM encode_track_info WHERE track_id=$1; +END + + +$sth{insert_tf_binding_track} = $dbh->prepare(<<'END') // die "Unable to prepare insert tf_binding_track statement: ".$dbh->errstr; +INSERT INTO encode_tf_binding_tracks +(track_id,chr,start,stop,name,score,strand,signalValue,pValue,qValue,peak) +VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11); +END + +$sth{select_track_id} = $dbh->prepare(<<'END') // die "Unable to prepare select track id statement: ".$dbh->errstr; +SELECT track.id FROM encode_tracks track WHERE track.track_name = $1; +END + +$sth{select_info_key_id} = $dbh->prepare(<<'END') // die "Unable to prepare select info key id statement: ".$dbh->errstr; +SELECT info_key.id FROM encode_track_info_keys info_key WHERE info_key.info_key = $1; +END + +my @ifh; +for my $ifn (@ARGV) { + my $ifh = IO::File->new($ifn,'r') or + die "Unable to open $ifn for reading: $!"; + push @ifh,$ifh; +} + +if (not @ARGV) { + push @ifh,\*STDIN; +} + +my $p = Term::ProgressBar->new({count => 1, + remove => 1, + ETA => 'linear',}); +my %encode_data_files; +for my $ifh (@ifh) { + while (<$ifh>) { + chomp; + my ($file,$info) = split /\t/,$_,2; + + $encode_data_files{$file} = + {filename => $file, + map {my @t = split /=/,$_,2; @t} split /\s*;\s*/,$info, + }; + } + close ($ifh); +} + +my $total_files = scalar keys %encode_data_files; +my $total_size = 0; +my $average_size = 0; +my $handled_files = 0; + +my %info_keys; +for my $file (keys %encode_data_files) { + $handled_files++; + if (! -e $encode_data_files{$file}{filename}) { + print STDERR "Skipping $encode_data_files{$file}{filename}, no gzip\n"; + next; + } + my $fh = IO::Uncompress::Gunzip->new($encode_data_files{$file}{filename},{MultiStream => 1}) or + die "Unable to open '$encode_data_files{$file}{filename}' for reading: $!"; + my $p_fh = $fh; + if ($fh->can('getHeaderInfo')) { + # oh boy is this horrible. + $p_fh = *$fh->{FH}; + } + my $cur_pos = $p_fh->tell; + $p_fh->seek(0,SEEK_END); + my $file_size = $p_fh->tell; + $p_fh->seek($cur_pos,SEEK_SET); + $average_size = ($average_size*($handled_files-1)+$file_size)/$handled_files; + $total_size=$total_size+$file_size; + $p->target($total_size+$average_size*($total_files-$handled_files)); + my $track_id = insert_track($dbh,\%sth,\%info_keys,$encode_data_files{$file}); + $dbh->do("COPY encode_tf_binding_tracks (track_id,chr,start,stop,name,score,strand,signalValue,pValue,qValue,peak) FROM STDIN"); + # insert data for this track + while (<$fh>) { + chomp; + my @row = split /\t/,$_; + $p->update($total_size-$file_size+$p_fh->tell); + $row[0] =~ s/^chr//; + $dbh->pg_putcopydata(join("\t",$track_id,@row)."\n"); + } + $dbh->pg_putcopyend(); + $p->update($total_size); + + +} + +sub select_one { + my ($dbh,$sth,$name,$bind,$pos) = @_; + $pos //= 0; + + my $rv = $sth->{$name}->execute(@{$bind}) or + die "Unable to execute statement '$name'".$dbh->errstr(); + my ($item) = map {ref $_ ?@{$_}:()} + map {ref $_ ?@{$_}:()} $sth->{$name}->fetchall_arrayref([$pos]); + $sth->{$name}->finish(); + return $item; +} + +sub insert_track { + my ($dbh,$sth,$info_keys,$encode_data_files) = @_; + + my $track_name = $encode_data_files->{tableName}//$encode_data_files->{filename}; + + # insert the track file + my $track_id; + $track_id = select_one($dbh,$sth,'select_track_id', + [$track_name]); + if (defined $track_id) { + $sth->{delete_track_info}->execute($track_id) or + die "Unable to delete track information".$dbh->errstr(); + } else { + $sth->{insert_track}->execute($track_name) or + die "Unable to insert track".$dbh->errstr(); + $track_id = $dbh->last_insert_id((undef) x 2,'encode_tracks',undef); + if (not defined $track_id) { + die "Unable to get a valid track id after inserting a track"; + } + } + # insert track info + for my $key (keys %{$encode_data_files}) { + my $info_key_id = $info_keys->{$key}; + if (not defined $info_key_id) { + $info_key_id = select_one($dbh,$sth,'select_info_key_id',[$key]); + if (not defined $info_key_id) { + $sth->{insert_track_info_keys}->execute($key) or + die "Unable to insert track info".$dbh->errstr(); + $info_key_id = $dbh->last_insert_id((undef) x 2,'encode_track_info_keys',undef); + if (not defined $info_key_id) { + die "Unable to get a valid track info key after inserting one"; + } + } + $info_keys->{$key} = $info_key_id; + } + $sth->{insert_track_info}->execute($track_id,$info_key_id,$encode_data_files->{$key}); + } + return $track_id; +} + + +__END__ -- 2.39.2