--- /dev/null
+#!/usr/bin/perl
+# load_encode_data.pl loads tracks from encode into an snp database
+# and is released under the terms of the GNU GPL version 3, or any
+# later version, at your option. See the file README and COPYING for
+# more information.
+# Copyright 2013 by Don Armstrong <don@donarmstrong.com>.
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+load_encode_data.pl - loads tracks from encode into an snp database
+
+=head1 SYNOPSIS
+
+load_encode_data.pl [options] [filelist.txt]
+
+
+ Options:
+ --service, -s pgsql service
+ --progress, -p show progress bar
+ --debug, -d debugging level (Default 0)
+ --help, -h display this help
+ --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--service,-s>
+
+Postgresql service
+
+=item B<--progress,-p>
+
+Show a progress bar
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+load_encode_data.pl
+
+=cut
+
+
+use vars qw($DEBUG);
+use DBI;
+use Term::ProgressBar;
+use Fcntl qw(:seek);
+use Text::CSV;
+use IO::Uncompress::Gunzip;
+
+my %options = (debug => 0,
+ help => 0,
+ man => 0,
+ service => 'snp',
+ progress => 1,
+ );
+
+GetOptions(\%options,
+ 'service|s=s',
+ 'progress|p!',
+ 'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+my @USAGE_ERRORS;
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+my $dbh = DBI->connect("dbi:Pg:service=$options{service}",
+ '','',{AutoCommit => 0}) or
+ die "Unable to connect to database: ".$DBI::errstr;
+
+my %sth;
+$sth{insert_track} = $dbh->prepare(<<'END') // die "Unable to prepare insert track statement: ".$dbh->errstr;
+INSERT INTO encode_tracks
+(track_name) VALUES ($1);
+END
+
+$sth{insert_track_info_keys} = $dbh->prepare(<<'END') // die "Unable to prepare insert track info keys statement: ".$dbh->errstr;
+INSERT INTO encode_track_info_keys
+(info_key) VALUES ($1);
+END
+
+$sth{insert_track_info} = $dbh->prepare(<<'END') // die "Unable to prepare insert track info statement: ".$dbh->errstr;
+INSERT INTO encode_track_info
+(track_id,info_key_id,info_value) VALUES ($1,$2,$3);
+END
+
+$sth{delete_track_info} = $dbh->prepare(<<'END') // die "Unable to prepare delete track info statement: ".$dbh->errstr;
+DELETE FROM encode_track_info WHERE track_id=$1;
+END
+
+
+$sth{insert_tf_binding_track} = $dbh->prepare(<<'END') // die "Unable to prepare insert tf_binding_track statement: ".$dbh->errstr;
+INSERT INTO encode_tf_binding_tracks
+(track_id,chr,start,stop,name,score,strand,signalValue,pValue,qValue,peak)
+VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11);
+END
+
+$sth{select_track_id} = $dbh->prepare(<<'END') // die "Unable to prepare select track id statement: ".$dbh->errstr;
+SELECT track.id FROM encode_tracks track WHERE track.track_name = $1;
+END
+
+$sth{select_info_key_id} = $dbh->prepare(<<'END') // die "Unable to prepare select info key id statement: ".$dbh->errstr;
+SELECT info_key.id FROM encode_track_info_keys info_key WHERE info_key.info_key = $1;
+END
+
+my @ifh;
+for my $ifn (@ARGV) {
+ my $ifh = IO::File->new($ifn,'r') or
+ die "Unable to open $ifn for reading: $!";
+ push @ifh,$ifh;
+}
+
+if (not @ARGV) {
+ push @ifh,\*STDIN;
+}
+
+my $p = Term::ProgressBar->new({count => 1,
+ remove => 1,
+ ETA => 'linear',});
+my %encode_data_files;
+for my $ifh (@ifh) {
+ while (<$ifh>) {
+ chomp;
+ my ($file,$info) = split /\t/,$_,2;
+
+ $encode_data_files{$file} =
+ {filename => $file,
+ map {my @t = split /=/,$_,2; @t} split /\s*;\s*/,$info,
+ };
+ }
+ close ($ifh);
+}
+
+my $total_files = scalar keys %encode_data_files;
+my $total_size = 0;
+my $average_size = 0;
+my $handled_files = 0;
+
+my %info_keys;
+for my $file (keys %encode_data_files) {
+ $handled_files++;
+ if (! -e $encode_data_files{$file}{filename}) {
+ print STDERR "Skipping $encode_data_files{$file}{filename}, no gzip\n";
+ next;
+ }
+ my $fh = IO::Uncompress::Gunzip->new($encode_data_files{$file}{filename},{MultiStream => 1}) or
+ die "Unable to open '$encode_data_files{$file}{filename}' for reading: $!";
+ my $p_fh = $fh;
+ if ($fh->can('getHeaderInfo')) {
+ # oh boy is this horrible.
+ $p_fh = *$fh->{FH};
+ }
+ my $cur_pos = $p_fh->tell;
+ $p_fh->seek(0,SEEK_END);
+ my $file_size = $p_fh->tell;
+ $p_fh->seek($cur_pos,SEEK_SET);
+ $average_size = ($average_size*($handled_files-1)+$file_size)/$handled_files;
+ $total_size=$total_size+$file_size;
+ $p->target($total_size+$average_size*($total_files-$handled_files));
+ my $track_id = insert_track($dbh,\%sth,\%info_keys,$encode_data_files{$file});
+ $dbh->do("COPY encode_tf_binding_tracks (track_id,chr,start,stop,name,score,strand,signalValue,pValue,qValue,peak) FROM STDIN");
+ # insert data for this track
+ while (<$fh>) {
+ chomp;
+ my @row = split /\t/,$_;
+ $p->update($total_size-$file_size+$p_fh->tell);
+ $row[0] =~ s/^chr//;
+ $dbh->pg_putcopydata(join("\t",$track_id,@row)."\n");
+ }
+ $dbh->pg_putcopyend();
+ $p->update($total_size);
+
+
+}
+
+sub select_one {
+ my ($dbh,$sth,$name,$bind,$pos) = @_;
+ $pos //= 0;
+
+ my $rv = $sth->{$name}->execute(@{$bind}) or
+ die "Unable to execute statement '$name'".$dbh->errstr();
+ my ($item) = map {ref $_ ?@{$_}:()}
+ map {ref $_ ?@{$_}:()} $sth->{$name}->fetchall_arrayref([$pos]);
+ $sth->{$name}->finish();
+ return $item;
+}
+
+sub insert_track {
+ my ($dbh,$sth,$info_keys,$encode_data_files) = @_;
+
+ my $track_name = $encode_data_files->{tableName}//$encode_data_files->{filename};
+
+ # insert the track file
+ my $track_id;
+ $track_id = select_one($dbh,$sth,'select_track_id',
+ [$track_name]);
+ if (defined $track_id) {
+ $sth->{delete_track_info}->execute($track_id) or
+ die "Unable to delete track information".$dbh->errstr();
+ } else {
+ $sth->{insert_track}->execute($track_name) or
+ die "Unable to insert track".$dbh->errstr();
+ $track_id = $dbh->last_insert_id((undef) x 2,'encode_tracks',undef);
+ if (not defined $track_id) {
+ die "Unable to get a valid track id after inserting a track";
+ }
+ }
+ # insert track info
+ for my $key (keys %{$encode_data_files}) {
+ my $info_key_id = $info_keys->{$key};
+ if (not defined $info_key_id) {
+ $info_key_id = select_one($dbh,$sth,'select_info_key_id',[$key]);
+ if (not defined $info_key_id) {
+ $sth->{insert_track_info_keys}->execute($key) or
+ die "Unable to insert track info".$dbh->errstr();
+ $info_key_id = $dbh->last_insert_id((undef) x 2,'encode_track_info_keys',undef);
+ if (not defined $info_key_id) {
+ die "Unable to get a valid track info key after inserting one";
+ }
+ }
+ $info_keys->{$key} = $info_key_id;
+ }
+ $sth->{insert_track_info}->execute($track_id,$info_key_id,$encode_data_files->{$key});
+ }
+ return $track_id;
+}
+
+
+__END__