3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Stuff for interacting with UCSC genome browser
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 use vars qw ( @ISA @EXPORT );
62 @ISA = qw( Exporter );
65 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
68 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
71 sub bed_entry_get_array
73 # Martin A. Hansen, September 2008.
75 # Reads a BED entry given a filehandle.
77 # This is a new _faster_ BED entry parser that
78 # uses arrays and not hashrefs.
80 # IMPORTANT! This function does not correct the
81 # BED_END position that is kept the way UCSC
84 my ( $fh, # file handle
85 $cols, # columns to read - OPTIONAL (3,4,5,6, or 12)
94 $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
96 return if not defined $line;
98 if ( not defined $cols ) {
99 $cols = 1 + $line =~ tr/\t//;
102 @entry = split "\t", $line, $cols + 1;
104 pop @entry if scalar @entry > $cols;
106 return wantarray ? @entry : \@entry;
112 # Martin A. Hansen, December 2007.
114 # Reads a bed entry given a filehandle.
116 my ( $fh, # file handle
117 $columns, # number of BED columns to read - OPTIONAL
122 my ( $line, @fields, %entry );
126 $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
128 return if not defined $line;
130 @fields = split "\t", $line;
132 $columns ||= scalar @fields;
137 "CHR" => $fields[ 0 ],
138 "CHR_BEG" => $fields[ 1 ],
139 "CHR_END" => $fields[ 2 ] - 1,
142 elsif ( $columns == 4 )
145 "CHR" => $fields[ 0 ],
146 "CHR_BEG" => $fields[ 1 ],
147 "CHR_END" => $fields[ 2 ] - 1,
148 "Q_ID" => $fields[ 3 ],
151 elsif ( $columns == 5 )
154 "CHR" => $fields[ 0 ],
155 "CHR_BEG" => $fields[ 1 ],
156 "CHR_END" => $fields[ 2 ] - 1,
157 "Q_ID" => $fields[ 3 ],
158 "SCORE" => $fields[ 4 ],
161 elsif ( $columns == 6 )
164 "CHR" => $fields[ 0 ],
165 "CHR_BEG" => $fields[ 1 ],
166 "CHR_END" => $fields[ 2 ] - 1,
167 "Q_ID" => $fields[ 3 ],
168 "SCORE" => $fields[ 4 ],
169 "STRAND" => $fields[ 5 ],
172 elsif ( $columns == 12 )
175 "CHR" => $fields[ 0 ],
176 "CHR_BEG" => $fields[ 1 ],
177 "CHR_END" => $fields[ 2 ] - 1,
178 "Q_ID" => $fields[ 3 ],
179 "SCORE" => $fields[ 4 ],
180 "STRAND" => $fields[ 5 ],
181 "THICK_BEG" => $fields[ 6 ],
182 "THICK_END" => $fields[ 7 ] - 1,
183 "COLOR" => $fields[ 8 ],
184 "BLOCK_COUNT" => $fields[ 9 ],
185 "BLOCK_LENS" => $fields[ 10 ],
186 "Q_BEGS" => $fields[ 11 ],
191 Maasha::Common::error( qq(Bad BED format in line->$line<-) );
194 $entry{ "REC_TYPE" } = "BED";
195 $entry{ "BED_LEN" } = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
196 $entry{ "BED_COLS" } = $columns;
198 return wantarray ? %entry : \%entry;
204 # Martin A. Hansen, January 2008.
206 # Given a path to a BED file, read in all entries
209 my ( $path, # full path to BED file
210 $columns, # number of columns in BED file - OPTIONAL (but is faster)
215 my ( $fh, $entry, @list );
217 $fh = Maasha::Common::read_open( $path );
219 while ( $entry = bed_get_entry( $fh ) ) {
225 return wantarray ? @list : \@list;
229 sub bed_entry_put_array
231 # Martin A. Hansen, Septermber 2008.
233 # Writes a BED entry array to file.
235 # IMPORTANT! This function does not correct the
236 # BED_END position that is assumed to be in the
237 # UCSC positions scheme.
240 $fh, # file handle - OPTIONAL
241 $cols, # number of columns in BED file - OPTIONAL
246 $fh = \*STDOUT if not defined $fh;
248 if ( defined $cols ) {
249 print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
251 print $fh join( "\t", @{ $record } ), "\n";
258 # Martin A. Hansen, Septermber 2007.
260 # Writes a BED entry to file.
262 # NB, this could really be more robust!?
264 my ( $record, # hashref
265 $fh, # file handle - OPTIONAL
266 $columns, # number of columns in BED file - OPTIONAL (but is faster)
273 $columns ||= 12; # max number of columns possible
277 push @fields, $record->{ "CHR" };
278 push @fields, $record->{ "CHR_BEG" };
279 push @fields, $record->{ "CHR_END" } + 1;
281 elsif ( $columns == 4 )
283 $record->{ "Q_ID" } =~ s/\s+/_/g;
285 push @fields, $record->{ "CHR" };
286 push @fields, $record->{ "CHR_BEG" };
287 push @fields, $record->{ "CHR_END" } + 1;
288 push @fields, $record->{ "Q_ID" };
290 elsif ( $columns == 5 )
292 $record->{ "Q_ID" } =~ s/\s+/_/g;
293 $record->{ "SCORE" } =~ s/\.\d*//;
295 push @fields, $record->{ "CHR" };
296 push @fields, $record->{ "CHR_BEG" };
297 push @fields, $record->{ "CHR_END" } + 1;
298 push @fields, $record->{ "Q_ID" };
299 push @fields, $record->{ "SCORE" };
301 elsif ( $columns == 6 )
303 $record->{ "Q_ID" } =~ s/\s+/_/g;
304 $record->{ "SCORE" } =~ s/\.\d*//;
306 push @fields, $record->{ "CHR" };
307 push @fields, $record->{ "CHR_BEG" };
308 push @fields, $record->{ "CHR_END" } + 1;
309 push @fields, $record->{ "Q_ID" };
310 push @fields, $record->{ "SCORE" };
311 push @fields, $record->{ "STRAND" };
315 $record->{ "Q_ID" } =~ s/\s+/_/g;
316 $record->{ "SCORE" } =~ s/\.\d*//;
318 push @fields, $record->{ "CHR" };
319 push @fields, $record->{ "CHR_BEG" };
320 push @fields, $record->{ "CHR_END" } + 1;
321 push @fields, $record->{ "Q_ID" };
322 push @fields, $record->{ "SCORE" };
323 push @fields, $record->{ "STRAND" };
324 push @fields, $record->{ "THICK_BEG" } if defined $record->{ "THICK_BEG" };
325 push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
326 push @fields, $record->{ "COLOR" } if defined $record->{ "COLOR" };
327 push @fields, $record->{ "BLOCK_COUNT" } if defined $record->{ "BLOCK_COUNT" };
328 push @fields, $record->{ "BLOCK_LENS" } if defined $record->{ "BLOCK_LENS" };
329 push @fields, $record->{ "Q_BEGS" } if defined $record->{ "Q_BEGS" };
333 print $fh join( "\t", @fields ), "\n";
335 print join( "\t", @fields ), "\n";
342 # Martin A. Hansen, January 2008.
344 # Write a list of BED entries.
346 my ( $entries, # list of entries,
347 $fh, # file handle - OPTIONAL
352 map { bed_put_entry( $_, $fh ) } @{ $entries };
358 # Martin A. Hansen, March 2008.
360 # Given a bed record, analysis this to give information
361 # about intron/exon sizes.
363 my ( $entry, # BED entry
368 my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
371 $exon_min = 9999999999;
373 $intron_min = 9999999999;
375 $entry->{ "EXONS" } = $entry->{ "BLOCK_COUNT" };
377 @begs = split /,/, $entry->{ "Q_BEGS" };
378 @lens = split /,/, $entry->{ "BLOCK_LENS" };
380 for ( $i = 0; $i < $entry->{ "BLOCK_COUNT" }; $i++ )
382 $exon_len = $lens[ $i ];
384 $entry->{ "EXON_LEN_$i" } = $exon_len;
386 $exon_max = $exon_len if $exon_len > $exon_max;
387 $exon_min = $exon_len if $exon_len < $exon_min;
389 $exon_tot += $exon_len;
392 $entry->{ "EXON_LEN_-1" } = $exon_len;
393 $entry->{ "EXON_MAX_LEN" } = $exon_max;
394 $entry->{ "EXON_MIN_LEN" } = $exon_min;
395 $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
397 $entry->{ "INTRONS" } = $entry->{ "BLOCK_COUNT" } - 1;
398 $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
400 if ( $entry->{ "INTRONS" } )
402 for ( $i = 1; $i < $entry->{ "BLOCK_COUNT" }; $i++ )
404 $intron_len = $begs[ $i ] - ( $begs[ $i - 1 ] + $lens[ $i - 1 ] );
406 $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
408 $intron_max = $intron_len if $intron_len > $intron_max;
409 $intron_min = $intron_len if $intron_len < $intron_min;
411 $intron_tot += $intron_len;
414 $entry->{ "INTRON_LEN_-1" } = $intron_len;
415 $entry->{ "INTRON_MAX_LEN" } = $intron_max;
416 $entry->{ "INTRON_MIN_LEN" } = $intron_min;
417 $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
420 return wantarray ? %{ $entry } : $entry;
426 # Martin A. Hansen, Septermber 2008
428 # Sorts a BED file using the c program
429 # "bed_sort" specifing a sort mode:
431 # 1: chr AND chr_beg.
432 # 2: chr AND strand AND chr_beg.
434 # 4: strand AND chr_beg.
436 my ( $bed_file, # BED file to sort
437 $sort_mode, # See above.
438 $cols, # Number of columns in BED file
441 &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
445 sub bed_split_to_files
447 # Martin A. Hansen, Septermber 2008
449 # Given a list of BED files, split these
450 # into temporary files based on the chromosome
451 # name. Returns a list of the temporary files.
453 my ( $bed_files, # list of BED files to split
454 $cols, # number of columns
455 $tmp_dir, # temporary directory
460 my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
462 foreach $bed_file ( @{ $bed_files } )
464 $fh_in = Maasha::Common::read_open( $bed_file );
466 while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
468 $key = $entry->[ CHR ];
470 $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
472 bed_entry_put_array( $entry, $fh_hash{ $key } );
478 foreach $key ( sort keys %fh_hash )
480 push @tmp_files, "$tmp_dir/$key.temp";
482 close $fh_hash{ $key };
485 return wantarray ? @tmp_files : \@tmp_files;
489 sub bed_merge_entries
491 # Martin A. Hansen, February 2008.
493 # Merge a list of given BED entries in one big entry.
495 my ( $entries, # list of BED entries to be merged
500 my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
502 @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
504 for ( $i = 0; $i < @{ $entries }; $i++ )
506 Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" } ne $entries->[ $i ]->{ "CHR" };
507 Maasha::Common::error( qq(Attempted merge of BED entries from different strands) ) if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
509 push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
511 if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
513 @q_begs = split ",", $entries->[ $i ]->{ "Q_BEGS" };
514 @blocksizes = split ",", $entries->[ $i ]->{ "BLOCK_LENS" };
519 @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
522 map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
524 push @new_q_begs, @q_begs;
525 push @new_blocksizes, @blocksizes;
528 map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
531 CHR => $entries->[ 0 ]->{ "CHR" },
532 CHR_BEG => $entries->[ 0 ]->{ "CHR_BEG" },
533 CHR_END => $entries->[ -1 ]->{ "CHR_END" },
535 BED_LEN => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
537 Q_ID => join( ":", @q_ids ),
539 STRAND => $entries->[ 0 ]->{ "STRAND" } || "+",
540 THICK_BEG => $entries->[ 0 ]->{ "THICK_BEG" } || $entries->[ 0 ]->{ "CHR_BEG" },
541 THICK_END => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
543 BLOCK_COUNT => scalar @new_q_begs,
544 BLOCK_LENS => join( ",", @new_blocksizes ),
545 Q_BEGS => join( ",", @new_q_begs ),
548 return wantarray ? %new_entry : \%new_entry;
554 # Martin A. Hansen, February 2008.
556 # Splits a given BED entry into a list of blocks,
557 # which are returned. A list of 6 column BED entry is returned.
559 my ( $entry, # BED entry hashref
564 my ( @q_begs, @blocksizes, $block, @blocks, $i );
566 if ( exists $entry->{ "BLOCK_COUNT" } )
568 @q_begs = split ",", $entry->{ "Q_BEGS" };
569 @blocksizes = split ",", $entry->{ "BLOCK_LENS" };
571 for ( $i = 0; $i < @q_begs; $i++ )
575 $block->{ "CHR" } = $entry->{ "CHR" };
576 $block->{ "CHR_BEG" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
577 $block->{ "CHR_END" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
578 $block->{ "Q_ID" } = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
579 $block->{ "SCORE" } = $entry->{ "SCORE" };
580 $block->{ "STRAND" } = $entry->{ "STRAND" };
581 $block->{ "BED_LEN" } = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
582 $block->{ "BED_COLS" } = 6;
583 $block->{ "REC_TYPE" } = "BED";
585 push @blocks, $block;
590 @blocks = @{ $entry };
593 return wantarray ? @blocks : \@blocks;
600 # Martin A. Hansen, February 2008.
602 # Checks if two BED entries overlap and
603 # return 1 if so - else 0;
605 my ( $entry1, # hashref
607 $no_strand, # don't check strand flag - OPTIONAL
612 return 0 if $entry1->{ "CHR" } ne $entry2->{ "CHR" };
613 return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
615 if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
623 sub bed_upload_to_ucsc
625 # Martin A. Hansen, September 2007.
627 # Upload a BED file to the UCSC database.
629 my ( $tmp_dir, # temporary directory
630 $file, # file to upload,
631 $options, # argument hashref
632 $append, # flag indicating table should be appended
637 my ( $args, $table, $sql_file, $fh_out, $fh_in );
640 $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
642 $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
645 if ( $options->{ "table" } =~ /rnaSecStr/ )
647 $table = $options->{ "table" };
649 print qq(uploading secondary structure table:"$table"\n) if $options->{ "verbose" };
651 $sql_file = "$tmp_dir/upload_RNA_SS.sql";
653 $fh_out = Maasha::Common::write_open( $sql_file );
656 CREATE TABLE $table (
657 bin smallint not null, # Bin number for browser speedup
658 chrom varchar(255) not null, # Chromosome or FPC contig
659 chromStart int unsigned not null, # Start position in chromosome
660 chromEnd int unsigned not null, # End position in chromosome
661 name varchar(255) not null, # Name of item
662 score int unsigned not null, # Score from 0-1000
663 strand char(1) not null, # + or -
664 size int unsigned not null, # Size of element.
665 secStr longblob not null, # Parentheses and '.'s which define the secondary structure
666 conf longblob not null, # Confidence of secondary-structure annotation per position (0.0-1.0).
669 INDEX(chrom(8), bin),
670 INDEX(chrom(8), chromStart)
676 Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
682 Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
687 sub psl_upload_to_ucsc
689 # Martin A. Hansen, September 2007.
691 # Upload a PSL file to the UCSC database.
693 my ( $file, # file to upload,
694 $options, # argument hashref
695 $append, # flag indicating table should be appended
703 $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
705 $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
708 Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
712 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
715 sub ucsc_config_entry_get
717 # Martin A. Hansen, November 2008.
719 # Given a filehandle to a UCSC Genome Browser
720 # config file (.ra) get the next entry and
721 # return as a hash. Entries are separated by blank
722 # lines. # lines are skipped unless it is the lines:
726 my ( $fh, # file hanlde
731 my ( $line, %record );
733 while ( $line = <$fh> )
737 if ( $line =~ /Track added by 'upload_to_ucsc' (\S+) (\S+)/) {
738 $record{ 'date' } = $1;
739 $record{ 'time' } = $2;
740 } elsif ( $line =~ /^# date: (.+)/ ) {
741 $record{ 'date' } = $1;
742 } elsif ( $line =~ /^# time: (.+)/ ) {
743 $record{ 'time' } = $1;
744 } elsif ( $line =~ /^# (?:database:|Database) (.+)/ ) {
745 $record{ 'database' } = $1;
746 } elsif ( $line =~ /^#/ ) {
748 } elsif ( $line =~ /(\S+)\s+(.+)/ ) {
752 last if $line =~ /^$/ and exists $record{ "track" };
755 return undef if not exists $record{ "track" };
757 return wantarray ? %record : \%record;
761 sub ucsc_config_entry_put
763 # Martin A. Hansen, November 2008.
765 # Outputs a Biopiece record (a hashref)
766 # to a filehandle or STDOUT.
768 my ( $record, # hashref
769 $fh_out, # file handle
776 $fh_out ||= \*STDOUT;
778 ( $date, $time ) = split " ", Maasha::Common::time_stamp();
780 $record->{ "date" } ||= $date;
781 $record->{ "time" } ||= $time;
783 print $fh_out "\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n";
785 map { print $fh_out "# $_: $record->{ $_ }\n" if exists $record->{ $_ } } qw( date time database );
786 map { print $fh_out "$_ $record->{ $_ }\n" if exists $record->{ $_ } } qw( track
807 sub ucsc_update_config
809 # Martin A. Hansen, September 2007.
811 # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
813 my ( $options, # hashref
819 my ( $file, $entry, %new_entry, $fh_in, $fh_out, $was_found );
821 $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
823 Maasha::Filesys::file_copy( $file, "$file~" ); # create a backup
826 database => $options->{ 'database' },
827 track => $options->{ 'table' },
828 shortLabel => $options->{ 'short_label' },
829 longLabel => $options->{ 'long_label' },
830 group => $options->{ 'group' },
831 priority => $options->{ 'priority' },
832 visibility => $options->{ 'visibility' },
833 color => $options->{ 'color' },
837 $new_entry{ 'useScore' } = 1 if $options->{ 'use_score' };
838 $new_entry{ 'mafTrack' } = "multiz17way" if $type eq "type bed 6 +";
839 $new_entry{ 'maxHeightPixels' } = "50:50:11" if $type eq "wig 0";
841 $fh_in = Maasha::Filesys::file_read_open( "$file~" );
842 $fh_out = Maasha::Filesys::file_write_open( $file );
844 while ( $entry = ucsc_config_entry_get( $fh_in ) )
846 if ( $entry->{ 'database' } eq $new_entry{ 'database' } and $entry->{ 'track' } eq $new_entry{ 'track' } )
848 ucsc_config_entry_put( \%new_entry, $fh_out );
854 ucsc_config_entry_put( $entry, $fh_out );
858 ucsc_config_entry_put( \%new_entry, $fh_out ) if not $was_found;
863 Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
867 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> fixedStep format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
870 sub fixedstep_get_entry
872 # Martin A. Hansen, December 2007.
874 # Given a file handle to a PhastCons file get the
875 # next entry which is all the lines after a "fixedStep"
876 # line and until the next "fixedStep" line or EOF.
878 my ( $fh, # filehandle
881 # Returns a list of lines
883 my ( $entry, @lines );
885 local $/ = "\nfixedStep ";
891 @lines = split "\n", $entry;
893 return if @lines == 0;
895 $lines[ 0 ] =~ s/fixedStep?\s*//;
897 return wantarray ? @lines : \@lines;
901 sub fixedstep_index_create
903 # Martin A. Hansen, January 2008.
905 # Indexes a concatenated fixedStep file.
906 # The index consists of a hash with chromosomes as keys,
907 # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
909 my ( $path, # path to fixedStep file
914 my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
916 $fh = Maasha::Common::read_open( $path );
920 while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) )
922 $locator = shift @{ $entry };
924 if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
927 $beg = $2 - 1; # fixedStep files are 1-based
932 Maasha::Common::error( qq(Could not parse locator: $locator) );
935 $pos += length( $locator ) + 11;
939 # map { $pos += length( $_ ) + 1 } @{ $entry };
941 $pos += 6 * scalar @{ $entry };
943 $index_len = $pos - $index_beg;
945 push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
950 foreach $chr ( keys %index )
952 for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
953 $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
956 $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1;
959 return wantarray ? %index : \%index;
963 sub fixedstep_index_store
965 # Martin A. Hansen, January 2008.
967 # Writes a fixedStep index to binary file.
969 my ( $path, # full path to file
970 $index, # list with index
975 Maasha::Common::file_store( $path, $index );
979 sub fixedstep_index_retrieve
981 # Martin A. Hansen, January 2008.
983 # Retrieves a fixedStep index from binary file.
985 my ( $path, # full path to file
992 $index = Maasha::Common::file_retrieve( $path );
994 return wantarray ? %{ $index } : $index;
998 sub fixedstep_index_lookup
1000 # Martin A. Hansen, January 2008.
1002 # Retrieve fixedStep scores from a indexed
1003 # fixedStep file given a chromosome and
1004 # begin and end positions.
1006 my ( $index, # data structure
1007 $fh, # filehandle to datafile
1009 $chr_beg, # chromosome beg
1010 $chr_end, # chromosome end
1011 $flank, # include flanking region - OPTIONAL
1016 my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1023 # print "chr_beg->$chr_beg chr_end->$chr_end flank->$flank\n";
1025 if ( exists $index->{ $chr } )
1027 $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1029 if ( $index_beg < 0 ) {
1030 Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1033 if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1035 $index_end = $index_beg;
1039 $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1041 if ( $index_end < 0 ) {
1042 Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1046 map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1048 if ( $index_beg == $index_end )
1050 $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1051 $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1053 if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] )
1055 @vals = split "\n", Maasha::Common::file_read(
1057 $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1058 6 * ( $end - $beg + 1 ),
1062 for ( $c = 0; $c < @vals; $c++ ) {
1063 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1068 $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] );
1070 # print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1071 # print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] );
1077 if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] )
1079 @vals = split "\n", Maasha::Common::file_read(
1081 $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ),
1082 6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ),
1085 for ( $c = 0; $c < @vals; $c++ ) {
1086 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1090 $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] );
1092 if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] )
1094 @vals = split "\n", Maasha::Common::file_read(
1096 $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ],
1097 6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ),
1100 for ( $c = 0; $c < @vals; $c++ ) {
1101 $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1105 for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1107 @vals = split "\n", Maasha::Common::file_read(
1109 $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ],
1110 6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ),
1113 for ( $c = 0; $c < @vals; $c++ ) {
1114 $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1121 Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1124 return wantarray ? @{ $scores } : $scores;
1128 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1133 # Martin A. Hansen, July 2008
1135 # Create a fixedStep index for PhastCons data.
1137 my ( $file, # file to index
1138 $dir, # dir with file
1145 $index = fixedstep_index_create( "$dir/$file" );
1147 fixedstep_index_store( "$dir/$file.index", $index );
1151 sub phastcons_parse_entry
1153 # Martin A. Hansen, December 2007.
1155 # Given a PhastCons entry converts this to a
1156 # list of super blocks.
1158 # $options->{ "min" } ||= 10;
1159 # $options->{ "dist" } ||= 25;
1160 # $options->{ "threshold" } ||= 0.8;
1161 # $options->{ "gap" } ||= 5;
1163 my ( $lines, # list of lines
1164 $args, # argument hash
1169 my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
1171 $info = shift @{ $lines };
1173 if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
1179 die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
1184 while ( $i < @{ $lines } )
1186 if ( $lines->[ $i ] >= $args->{ "threshold" } )
1190 while ( $c < @{ $lines } )
1192 if ( $lines->[ $c ] < $args->{ "threshold" } )
1196 while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
1200 if ( $j - $c > $args->{ "gap" } )
1202 if ( $c - $i >= $args->{ "min" } )
1206 CHR_BEG => $beg + $i - 1,
1207 CHR_END => $beg + $c - 2,
1225 if ( $c - $i >= $args->{ "min" } )
1229 CHR_BEG => $beg + $i - 1,
1230 CHR_END => $beg + $c - 2,
1245 while ( $i < @blocks )
1249 while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
1254 push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
1259 foreach $super_block ( @super_blocks )
1261 foreach $block ( @{ $super_block } )
1263 push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
1264 push @lens, $block->{ "CHR_LEN" } - 1;
1270 CHR => $super_block->[ 0 ]->{ "CHR" },
1271 CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" },
1272 CHR_END => $super_block->[ -1 ]->{ "CHR_END" },
1276 THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" },
1277 THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1,
1279 BLOCK_COUNT => scalar @{ $super_block },
1280 BLOCK_LENS => join( ",", @lens ),
1281 Q_BEGS => join( ",", @begs ),
1288 return wantarray ? @entries : \@entries;
1292 sub phastcons_normalize
1294 # Martin A. Hansen, January 2008.
1296 # Normalizes a list of lists with PhastCons scores,
1297 # in such a way that each list contains the same number
1298 # of PhastCons scores.
1300 my ( $AoA, # AoA with PhastCons scores
1305 my ( $list, $max, $min, $mean, $diff );
1310 foreach $list ( @{ $AoA } )
1312 $min = scalar @{ $list } if scalar @{ $list } < $min;
1313 $max = scalar @{ $list } if scalar @{ $list } > $max;
1316 $mean = int( ( $min + $max ) / 2 );
1318 # print STDERR "min->$min max->$max mean->$mean\n";
1320 foreach $list ( @{ $AoA } )
1322 $diff = scalar @{ $list } - $mean;
1324 phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1325 phastcons_list_deflate( $list, $diff ) if $diff > 0;
1328 return wantarray ? @{ $AoA } : $AoA;
1332 sub phastcons_list_inflate
1334 # Martin A. Hansen, January 2008.
1336 # Inflates a list with a given number of elements
1337 # in such a way that the extra elements are introduced
1338 # evenly over the entire length of the list. The value
1339 # of the extra elements is based on a mean of the
1340 # adjacent elements.
1342 my ( $list, # list of elements
1343 $diff, # number of elements to introduce
1348 my ( $len, $space, $i, $pos );
1350 $len = scalar @{ $list };
1352 $space = $len / $diff;
1354 for ( $i = 0; $i < $diff; $i++ )
1356 $pos = int( ( $space / 2 ) + $i * $space );
1358 splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1359 # splice @{ $list }, $pos, 0, "X";
1362 die qq(ERROR: Bad inflate\n) if scalar @{ $list } != $len + $diff;
1366 sub phastcons_list_deflate
1368 # Martin A. Hansen, January 2008.
1370 # Deflates a list by removing a given number of elements
1371 # evenly distributed over the entire list.
1373 my ( $list, # list of elements
1374 $diff, # number of elements to remove
1379 my ( $len, $space, $i, $pos );
1381 $len = scalar @{ $list };
1383 $space = ( $len - $diff ) / $diff;
1385 for ( $i = 0; $i < $diff; $i++ )
1387 $pos = int( ( $space / 2 ) + $i * $space );
1389 splice @{ $list }, $pos, 1;
1392 die qq(ERROR: Dad deflate\n) if scalar @{ $list } != $len - $diff;
1398 # Martin A. Hansen, January 2008.
1400 # Given a normalized PhastCons matrix in an AoA,
1401 # calculate the mean for each column and return as a list.
1403 my ( $AoA, # AoA with normalized PhastCons scores
1410 $AoA = Maasha::Matrix::matrix_flip( $AoA );
1412 map { push @list, Maasha::Calc::mean( $_ ) } @{ $AoA };
1414 return wantarray ? @list : \@list;
1418 sub phastcons_median
1420 # Martin A. Hansen, January 2008.
1422 # Given a normalized PhastCons matrix in an AoA,
1423 # calculate the median for each column and return as a list.
1425 my ( $AoA, # AoA with normalized PhastCons scores
1432 $AoA = Maasha::Matrix::matrix_flip( $AoA );
1434 map { push @list, Maasha::Calc::median( $_ ) } @{ $AoA };
1436 return wantarray ? @list : \@list;
1440 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1445 # Martin A. Hansen, April 2008.
1447 # Executes mafFrag to extract a subalignment from a multiz track
1448 # in the UCSC genome browser database.
1450 my ( $tmp_dir, # temporary directory
1451 $database, # genome database
1452 $table, # table with the multiz track
1454 $beg, # begin position
1455 $end, # end position
1459 # Returns a list of record
1461 my ( $tmp_file, $align );
1463 $tmp_file = "$tmp_dir/maf_extract.maf";
1465 Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1467 $align = maf_parse( $tmp_file );
1471 return wantarray ? @{ $align } : $align;
1477 # Martin A. Hansen, April 2008.
1480 my ( $path, # full path to MAF file
1483 # Returns a list of record.
1485 my ( $fh, $line, @fields, @align );
1487 $fh = Maasha::Common::read_open( $path );
1489 while ( $line = <$fh> )
1493 if ( $line =~ /^s/ )
1495 @fields = split / /, $line;
1498 SEQ_NAME => $fields[ 1 ],
1499 SEQ => $fields[ -1 ],
1501 ALIGN_LEN => length $fields[ -1 ],
1508 return wantarray ? @align : \@align;
1512 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1515 sub fixedstep_put_entry
1517 # Martin A. Hansen, April 2008.
1519 # Outputs a block of fixedStep values.
1520 # Used for outputting wiggle data.
1522 my ( $chr, # chromosome
1523 $beg, # start position
1524 $block, # list of scores
1525 $fh, # filehandle - OPTIONAL
1526 $log10, # flag indicating that log10 scores should be used
1531 $beg += 1; # fixedStep format is 1 based.
1535 print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1538 map { printf( $fh "%f\n", Maasha::Calc::log10( $_ + 1 ) ) } @{ $block };
1540 map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1545 sub wiggle_upload_to_ucsc
1547 # Martin A. Hansen, May 2008.
1549 # Upload a wiggle file to the UCSC database.
1551 my ( $tmp_dir, # temporary directory
1552 $wib_dir, # wib directory
1553 $wig_file, # file to upload,
1554 $options, # argument hashref
1561 # $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1563 # Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1565 if ( $options->{ 'verbose' } ) {
1566 `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file`;
1568 `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1573 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1578 # Martin A. Hansen, May 2008
1580 # Fetches the MySQL database user name from the
1581 # .hg.conf file in the users home directory.
1585 my ( $file, $fh, $line, $user );
1587 $file = "$ENV{ 'HOME' }/.hg.conf";
1589 return if not -f $file;
1591 $fh = Maasha::Common::read_open( $file );
1593 while ( $line = <$fh> )
1597 if ( $line =~ /^db\.user=(.+)/ )
1611 sub ucsc_get_password
1613 # Martin A. Hansen, May 2008
1615 # Fetches the MySQL database password from the
1616 # .hg.conf file in the users home directory.
1620 my ( $file, $fh, $line, $password );
1622 $file = "$ENV{ 'HOME' }/.hg.conf";
1624 return if not -f $file;
1626 $fh = Maasha::Common::read_open( $file );
1628 while ( $line = <$fh> )
1632 if ( $line =~ /^db\.password=(.+)/ )
1646 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<