2 # load_encode_data.pl loads tracks from encode into an snp database
3 # and is released under the terms of the GNU GPL version 3, or any
4 # later version, at your option. See the file README and COPYING for
6 # Copyright 2013 by Don Armstrong <don@donarmstrong.com>.
17 load_encode_data.pl - loads tracks from encode into an snp database
21 load_encode_data.pl [options] [filelist.txt]
25 --service, -s pgsql service
26 --progress, -p show progress bar
27 --debug, -d debugging level (Default 0)
28 --help, -h display this help
29 --man, -m display manual
39 =item B<--progress,-p>
45 Debug verbosity. (Default 0)
49 Display brief usage information.
66 use Term::ProgressBar;
69 use IO::Uncompress::Gunzip;
# Option defaults. Only the debug default (0) is visible in this excerpt.
71 my %options = (debug => 0,
81 'debug|d+','help|h|?','man|m');
# --help prints the brief usage text; --man asks Pod::Usage for verbose
# level 2, i.e. the whole manual.
83 pod2usage() if $options{help};
84 pod2usage({verbose=>2}) if $options{man};
# Publish the requested debug level through $DEBUG.
86 $DEBUG = $options{debug};
# If any usage errors were collected in @USAGE_ERRORS, print them one per
# line as the usage message.
90 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
# Connect to PostgreSQL through the service name given by --service; user
# and password are left empty. AutoCommit is off, so all statements run
# inside a transaction.
# NOTE(review): no commit is visible in this excerpt -- confirm one is
# issued before the script exits.
92 my $dbh = DBI->connect("dbi:Pg:service=$options{service}",
93 '','',{AutoCommit => 0}) or
94 die "Unable to connect to database: ".$DBI::errstr;
# Prepared statements, stored in %sth under the names later used as
# $sth->{NAME}:
#   insert_track            - add a row to encode_tracks
#   insert_track_info_keys  - add a row to encode_track_info_keys
#   insert_track_info       - add a (track, key, value) row
#   delete_track_info       - remove all info rows of one track
#   insert_tf_binding_track - add one row to encode_tf_binding_tracks
#   select_track_id         - look up a track id by track name
#   select_info_key_id      - look up an info key id by key name
# Each SQL body is a non-interpolating heredoc (<<'END'), so $1, $2, ...
# are passed to the driver as numbered placeholders instead of being
# expanded by Perl. When prepare returns undef, the defined-or (//) falls
# through to a die that includes $dbh->errstr.
# NOTE(review): insert_tf_binding_track is not executed anywhere in this
# excerpt (rows are loaded with COPY instead) -- confirm whether it is
# still needed.
97 $sth{insert_track} = $dbh->prepare(<<'END') // die "Unable to prepare insert track statement: ".$dbh->errstr;
98 INSERT INTO encode_tracks
99 (track_name) VALUES ($1);
102 $sth{insert_track_info_keys} = $dbh->prepare(<<'END') // die "Unable to prepare insert track info keys statement: ".$dbh->errstr;
103 INSERT INTO encode_track_info_keys
104 (info_key) VALUES ($1);
107 $sth{insert_track_info} = $dbh->prepare(<<'END') // die "Unable to prepare insert track info statement: ".$dbh->errstr;
108 INSERT INTO encode_track_info
109 (track_id,info_key_id,info_value) VALUES ($1,$2,$3);
112 $sth{delete_track_info} = $dbh->prepare(<<'END') // die "Unable to prepare delete track info statement: ".$dbh->errstr;
113 DELETE FROM encode_track_info WHERE track_id=$1;
117 $sth{insert_tf_binding_track} = $dbh->prepare(<<'END') // die "Unable to prepare insert tf_binding_track statement: ".$dbh->errstr;
118 INSERT INTO encode_tf_binding_tracks
119 (track_id,chr,start,stop,name,score,strand,signalValue,pValue,qValue,peak)
120 VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11);
123 $sth{select_track_id} = $dbh->prepare(<<'END') // die "Unable to prepare select track id statement: ".$dbh->errstr;
124 SELECT track.id FROM encode_tracks track WHERE track.track_name = $1;
127 $sth{select_info_key_id} = $dbh->prepare(<<'END') // die "Unable to prepare select info key id statement: ".$dbh->errstr;
128 SELECT info_key.id FROM encode_track_info_keys info_key WHERE info_key.info_key = $1;
# Every command-line argument is treated as a file list and opened
# read-only; failure to open one is fatal.
132 for my $ifn (@ARGV) {
133 my $ifh = IO::File->new($ifn,'r') or
134 die "Unable to open $ifn for reading: $!";
# The progress bar starts with a placeholder count of 1; its real target is
# set later through $p->target(...) as file sizes become known.
142 my $p = Term::ProgressBar->new({count => 1,
# Maps each listed file to the key/value description parsed below.
145 my %encode_data_files;
# A record in $_ is "<file> TAB <info>". The split limit of 2 keeps any
# further tabs inside <info>. <info> is a ';'-separated list of key=value
# pairs; the limit of 2 on the '=' split keeps '=' characters that occur
# inside a value.
149 my ($file,$info) = split /\t/,$_,2;
151 $encode_data_files{$file} =
153 map {my @t = split /=/,$_,2; @t} split /\s*;\s*/,$info,
# Counters for the running size estimate that drives the progress bar.
159 my $total_files = scalar keys %encode_data_files;
161 my $average_size = 0;
162 my $handled_files = 0;
# Load every listed track: size the file for the progress bar, register the
# track, then stream its rows into encode_tf_binding_tracks with COPY.
165 for my $file (keys %encode_data_files) {
# A file that does not exist on disk is reported on STDERR as skipped.
167 if (! -e $encode_data_files{$file}{filename}) {
168 print STDERR "Skipping $encode_data_files{$file}{filename}, no gzip\n";
# MultiStream => 1 makes the reader treat concatenated gzip members as one
# continuous stream.
# NOTE(review): IO::Uncompress::Gunzip reports constructor failures in
# $GunzipError, so $! may not describe the actual problem -- confirm and
# consider reporting $GunzipError instead.
171 my $fh = IO::Uncompress::Gunzip->new($encode_data_files{$file}{filename},{MultiStream => 1}) or
172 die "Unable to open '$encode_data_files{$file}{filename}' for reading: $!";
174 if ($fh->can('getHeaderInfo')) {
175 # oh boy is this horrible.
# Determine the size seen through $p_fh by seeking to its end, then restore
# the previous offset. $p_fh is set up on lines not shown here; presumably
# it is the underlying compressed file handle -- confirm.
178 my $cur_pos = $p_fh->tell;
179 $p_fh->seek(0,SEEK_END);
180 my $file_size = $p_fh->tell;
181 $p_fh->seek($cur_pos,SEEK_SET);
# Running mean of the file sizes seen so far.
# NOTE(review): this divides by $handled_files, which is initialised to 0;
# its increment is not visible in this excerpt -- confirm it is incremented
# before this line, otherwise the first file dies with a division by zero.
182 $average_size = ($average_size*($handled_files-1)+$file_size)/$handled_files;
183 $total_size=$total_size+$file_size;
# Projected total work: bytes of the files seen so far plus the mean size
# for every file not yet handled.
184 $p->target($total_size+$average_size*($total_files-$handled_files));
185 my $track_id = insert_track($dbh,\%sth,\%info_keys,$encode_data_files{$file});
# Open a COPY ... FROM STDIN; rows are supplied with pg_putcopydata and the
# copy is closed with pg_putcopyend below.
186 $dbh->do("COPY encode_tf_binding_tracks (track_id,chr,start,stop,name,score,strand,signalValue,pValue,qValue,peak) FROM STDIN");
187 # insert data for this track
190 my @row = split /\t/,$_;
# Progress position: bytes of all earlier files plus the current offset
# within this one.
191 $p->update($total_size-$file_size+$p_fh->tell);
# Re-emit the row tab-separated with the track id prepended.
193 $dbh->pg_putcopydata(join("\t",$track_id,@row)."\n");
195 $dbh->pg_putcopyend();
# Move the bar to the end of this file.
196 $p->update($total_size);
# Presumably the body of select_one (the sub header is not visible here; the
# argument order matches the select_one calls elsewhere in the file).
#   $dbh  - database handle, used for error text
#   $sth  - hashref of prepared statements
#   $name - key in $sth of the statement to run
#   $bind - arrayref of bind values
#   $pos  - index of the result column to return
# NOTE(review): some callers pass no $pos; any default is assigned on lines
# not shown here -- confirm.
202 my ($dbh,$sth,$name,$bind,$pos) = @_;
# NOTE(review): the die message has no separator between the fixed text and
# the driver's error string.
205 my $rv = $sth->{$name}->execute(@{$bind}) or
206 die "Unable to execute statement '$name'".$dbh->errstr();
# fetchall_arrayref([$pos]) returns a reference to a list of one-element
# rows. The inner map expands that reference into the rows, the outer map
# expands each row into its value, and ($item) keeps the first value
# (undef when no row was returned).
207 my ($item) = map {ref $_ ?@{$_}:()}
208 map {ref $_ ?@{$_}:()} $sth->{$name}->fetchall_arrayref([$pos]);
209 $sth->{$name}->finish();
# Presumably the body of insert_track (the sub header is not visible here;
# the argument order matches the insert_track call elsewhere in the file).
#   $dbh               - database handle
#   $sth               - hashref of prepared statements
#   $info_keys         - hashref caching info key name => info key id
#   $encode_data_files - hashref describing one track (key => value)
214 my ($dbh,$sth,$info_keys,$encode_data_files) = @_;
# The track is named after its tableName entry, falling back to filename.
216 my $track_name = $encode_data_files->{tableName}//$encode_data_files->{filename};
218 # insert the track file
220 $track_id = select_one($dbh,$sth,'select_track_id',
222 if (defined $track_id) {
# The track already exists: drop its old info rows so they can be written
# afresh below.
223 $sth->{delete_track_info}->execute($track_id) or
224 die "Unable to delete track information".$dbh->errstr();
# Insert the track and read back the id generated for encode_tracks
# (presumably the else branch of the check above -- the else line itself is
# not visible in this excerpt).
# NOTE(review): this and the other die messages here have no separator
# between the fixed text and the driver's error string.
226 $sth->{insert_track}->execute($track_name) or
227 die "Unable to insert track".$dbh->errstr();
228 $track_id = $dbh->last_insert_id((undef) x 2,'encode_tracks',undef);
229 if (not defined $track_id) {
230 die "Unable to get a valid track id after inserting a track";
# Store every key/value pair of the track description. Key ids are resolved
# through the $info_keys cache first, then the database, and are created on
# demand.
234 for my $key (keys %{$encode_data_files}) {
235 my $info_key_id = $info_keys->{$key};
236 if (not defined $info_key_id) {
237 $info_key_id = select_one($dbh,$sth,'select_info_key_id',[$key]);
238 if (not defined $info_key_id) {
# NOTE(review): the message says "track info", but this statement inserts a
# track info *key*.
239 $sth->{insert_track_info_keys}->execute($key) or
240 die "Unable to insert track info".$dbh->errstr();
241 $info_key_id = $dbh->last_insert_id((undef) x 2,'encode_track_info_keys',undef);
242 if (not defined $info_key_id) {
243 die "Unable to get a valid track info key after inserting one";
# Remember the id so later lookups of this key skip the database.
246 $info_keys->{$key} = $info_key_id;
# NOTE(review): unlike the other executes in this block, the result of this
# one is not checked -- a failed insert would go unnoticed here.
248 $sth->{insert_track_info}->execute($track_id,$info_key_id,$encode_data_files->{$key});