3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Stuff for interacting with UCSC genome browser
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
32 use vars qw ( @ISA @EXPORT );
35 use Time::HiRes qw( gettimeofday );
49 @ISA = qw( Exporter );
51 my $TIME = gettimeofday();
54 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
57 # http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
62 # Martin A. Hansen, December 2007.
64 # Reads a bed entry given a filehandle.
66 my ( $fh, # file handle
67 $columns, # number of BED columns to read - OPTIONAL
72 my ( $line, @fields, %entry );
76 $line =~ s/(\n|\r)$//g; # some people have carriage returns in their BED files -> Grrrr
78 return if not defined $line;
80 @fields = split "\t", $line;
82 $columns ||= scalar @fields;
87 "CHR" => $fields[ 0 ],
88 "CHR_BEG" => $fields[ 1 ],
89 "CHR_END" => $fields[ 2 ] - 1,
92 elsif ( $columns == 4 )
95 "CHR" => $fields[ 0 ],
96 "CHR_BEG" => $fields[ 1 ],
97 "CHR_END" => $fields[ 2 ] - 1,
98 "Q_ID" => $fields[ 3 ],
101 elsif ( $columns == 5 )
104 "CHR" => $fields[ 0 ],
105 "CHR_BEG" => $fields[ 1 ],
106 "CHR_END" => $fields[ 2 ] - 1,
107 "Q_ID" => $fields[ 3 ],
108 "SCORE" => $fields[ 4 ],
111 elsif ( $columns == 6 )
114 "CHR" => $fields[ 0 ],
115 "CHR_BEG" => $fields[ 1 ],
116 "CHR_END" => $fields[ 2 ] - 1,
117 "Q_ID" => $fields[ 3 ],
118 "SCORE" => $fields[ 4 ],
119 "STRAND" => $fields[ 5 ],
122 elsif ( $columns == 12 )
125 "CHR" => $fields[ 0 ],
126 "CHR_BEG" => $fields[ 1 ],
127 "CHR_END" => $fields[ 2 ] - 1,
128 "Q_ID" => $fields[ 3 ],
129 "SCORE" => $fields[ 4 ],
130 "STRAND" => $fields[ 5 ],
131 "THICK_BEG" => $fields[ 6 ],
132 "THICK_END" => $fields[ 7 ] - 1,
133 "ITEMRGB" => $fields[ 8 ],
134 "BLOCKCOUNT" => $fields[ 9 ],
135 "BLOCKSIZES" => $fields[ 10 ],
136 "Q_BEGS" => $fields[ 11 ],
141 &Maasha::Common::error( qq(Bad BED format in line->$line<-) );
144 $entry{ "REC_TYPE" } = "BED";
145 $entry{ "BED_LEN" } = $entry{ "CHR_END" } - $entry{ "CHR_BEG" } + 1;
146 $entry{ "BED_COLS" } = $columns;
148 return wantarray ? %entry : \%entry;
154 # Martin A. Hansen, January 2008.
156 # Given a path to a BED file, read in all entries
159 my ( $path, # full path to BED file
160 $columns, # number of columns in BED file - OPTIONAL (but is faster)
165 my ( $fh, $entry, @list );
167 $fh = &Maasha::Common::read_open( $path );
169 while ( $entry = &bed_get_entry( $fh ) ) {
175 return wantarray ? @list : \@list;
181 # Martin A. Hansen, Septermber 2007.
183 # Writes a BED entry to file.
185 # NB, this could really be more robust!?
187 my ( $record, # hashref
188 $fh, # file handle - OPTIONAL
189 $columns, # number of columns in BED file - OPTIONAL (but is faster)
196 $columns ||= 12; # max number of columns possible
200 push @fields, $record->{ "CHR" };
201 push @fields, $record->{ "CHR_BEG" };
202 push @fields, $record->{ "CHR_END" } + 1;
204 elsif ( $columns == 4 )
206 $record->{ "Q_ID" } =~ s/\s+/_/g;
208 push @fields, $record->{ "CHR" };
209 push @fields, $record->{ "CHR_BEG" };
210 push @fields, $record->{ "CHR_END" } + 1;
211 push @fields, $record->{ "Q_ID" };
213 elsif ( $columns == 5 )
215 $record->{ "Q_ID" } =~ s/\s+/_/g;
216 $record->{ "SCORE" } =~ s/\.\d*//;
218 push @fields, $record->{ "CHR" };
219 push @fields, $record->{ "CHR_BEG" };
220 push @fields, $record->{ "CHR_END" } + 1;
221 push @fields, $record->{ "Q_ID" };
222 push @fields, $record->{ "SCORE" };
224 elsif ( $columns == 6 )
226 $record->{ "Q_ID" } =~ s/\s+/_/g;
227 $record->{ "SCORE" } =~ s/\.\d*//;
229 push @fields, $record->{ "CHR" };
230 push @fields, $record->{ "CHR_BEG" };
231 push @fields, $record->{ "CHR_END" } + 1;
232 push @fields, $record->{ "Q_ID" };
233 push @fields, $record->{ "SCORE" };
234 push @fields, $record->{ "STRAND" };
238 $record->{ "Q_ID" } =~ s/\s+/_/g;
239 $record->{ "SCORE" } =~ s/\.\d*//;
241 push @fields, $record->{ "CHR" };
242 push @fields, $record->{ "CHR_BEG" };
243 push @fields, $record->{ "CHR_END" } + 1;
244 push @fields, $record->{ "Q_ID" };
245 push @fields, $record->{ "SCORE" };
246 push @fields, $record->{ "STRAND" };
247 push @fields, $record->{ "THICK_BEG" } if defined $record->{ "THICK_BEG" };
248 push @fields, $record->{ "THICK_END" } + 1 if defined $record->{ "THICK_END" };
249 push @fields, $record->{ "ITEMRGB" } if defined $record->{ "ITEMRGB" };
250 push @fields, $record->{ "BLOCKCOUNT" } if defined $record->{ "BLOCKCOUNT" };
251 push @fields, $record->{ "BLOCKSIZES" } if defined $record->{ "BLOCKSIZES" };
252 push @fields, $record->{ "Q_BEGS" } if defined $record->{ "Q_BEGS" };
256 print $fh join( "\t", @fields ), "\n";
258 print join( "\t", @fields ), "\n";
265 # Martin A. Hansen, January 2008.
267 # Write a list of BED entries.
269 my ( $entries, # list of entries,
270 $fh, # file handle - OPTIONAL
275 map { &bed_put_entry( $_, $fh ) } @{ $entries };
281 # Martin A. Hansen, March 2008.
283 # Given a bed record, analysis this to give information
284 # about intron/exon sizes.
286 my ( $entry, # BED entry
291 my ( $i, @begs, @lens, $exon_max, $exon_min, $exon_len, $exon_tot, $intron_max, $intron_min, $intron_len, $intron_tot );
294 $exon_min = 9999999999;
296 $intron_min = 9999999999;
298 $entry->{ "EXONS" } = $entry->{ "BLOCKCOUNT" };
300 @begs = split /,/, $entry->{ "Q_BEGS" };
301 @lens = split /,/, $entry->{ "BLOCKSIZES" };
303 for ( $i = 0; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
305 $exon_len = @lens[ $i ];
307 $entry->{ "EXON_LEN_$i" } = $exon_len;
309 $exon_max = $exon_len if $exon_len > $exon_max;
310 $exon_min = $exon_len if $exon_len < $exon_min;
312 $exon_tot += $exon_len;
315 $entry->{ "EXON_LEN_-1" } = $exon_len;
316 $entry->{ "EXON_MAX_LEN" } = $exon_max;
317 $entry->{ "EXON_MIN_LEN" } = $exon_min;
318 $entry->{ "EXON_MEAN_LEN" } = int( $exon_tot / $entry->{ "EXONS" } );
320 $entry->{ "INTRONS" } = $entry->{ "BLOCKCOUNT" } - 1;
321 $entry->{ "INTRONS" } = 0 if $entry->{ "INTRONS" } < 0;
323 if ( $entry->{ "INTRONS" } )
325 for ( $i = 1; $i < $entry->{ "BLOCKCOUNT" }; $i++ )
327 $intron_len = @begs[ $i ] - ( @begs[ $i - 1 ] + @lens[ $i - 1 ] );
329 $entry->{ "INTRON_LEN_" . ( $i - 1 ) } = $intron_len;
331 $intron_max = $intron_len if $intron_len > $intron_max;
332 $intron_min = $intron_len if $intron_len < $intron_min;
334 $intron_tot += $intron_len;
337 $entry->{ "INTRON_LEN_-1" } = $intron_len;
338 $entry->{ "INTRON_MAX_LEN" } = $intron_max;
339 $entry->{ "INTRON_MIN_LEN" } = $intron_min;
340 $entry->{ "INTRON_MEAN_LEN" } = int( $intron_tot / $entry->{ "INTRONS" } );
343 return wantarray ? %{ $entry } : $entry;
349 # Martin A. Hansen, March 2008.
351 # Sort a potential huge BED file according to
352 # CHR, CHR_BEG and optionally STRAND.
354 my ( $tmp_dir, # temporary directory used for sorting
355 $file, # BED file to sort
356 $strand, # flag to sort on strand - OPTIONAL
361 my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries );
363 $fh_in = &Maasha::Common::read_open( $file );
365 while ( $entry = &bed_get_entry( $fh_in ) )
368 $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" };
370 $key = $entry->{ "CHR" };
373 $fh_hash{ $key } = &Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key };
375 &bed_put_entry( $entry, $fh_hash{ $key } );
380 map { close $_ } keys %fh_hash;
382 $fh_out = &Maasha::Common::write_open( "$tmp_dir/temp.sort" );
384 foreach $part_file ( sort keys %fh_hash )
386 $entries = &bed_get_entries( "$tmp_dir/$part_file.sort" );
388 @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
390 map { &bed_put_entry( $_, $fh_out ) } @{ $entries };
392 unlink "$tmp_dir/$part_file.sort";
397 rename "$tmp_dir/temp.sort", $file;
401 sub bed_merge_entries
403 # Martin A. Hansen, February 2008.
405 # Merge a list of given BED entries in one big entry.
407 my ( $entries, # list of BED entries to be merged
412 my ( $i, @q_ids, @q_begs, @blocksizes, @new_q_begs, @new_blocksizes, %new_entry );
414 @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
416 for ( $i = 0; $i < @{ $entries }; $i++ )
418 &Maasha::Common::error( qq(Attempted merge of BED entries from different chromosomes) ) if $entries->[ 0 ]->{ "CHR" } ne $entries->[ $i ]->{ "CHR" };
419 &Maasha::Common::error( qq(Attempted merge of BED entries from different strands) ) if $entries->[ 0 ]->{ "STRAND" } ne $entries->[ $i ]->{ "STRAND" };
421 push @q_ids, $entries->[ $i ]->{ "Q_ID" } || sprintf( "ID%06d", $i );
423 if ( exists $entries->[ $i ]->{ "Q_BEGS" } )
425 @q_begs = split ",", $entries->[ $i ]->{ "Q_BEGS" };
426 @blocksizes = split ",", $entries->[ $i ]->{ "BLOCKSIZES" };
431 @blocksizes = $entries->[ $i ]->{ "CHR_END" } - $entries->[ $i ]->{ "CHR_BEG" } + 1;
434 map { $_ += $entries->[ $i ]->{ "CHR_BEG" } } @q_begs;
436 push @new_q_begs, @q_begs;
437 push @new_blocksizes, @blocksizes;
440 map { $_ -= $entries->[ 0 ]->{ "CHR_BEG" } } @new_q_begs;
443 CHR => $entries->[ 0 ]->{ "CHR" },
444 CHR_BEG => $entries->[ 0 ]->{ "CHR_BEG" },
445 CHR_END => $entries->[ -1 ]->{ "CHR_END" },
447 BED_LEN => $entries->[ -1 ]->{ "CHR_END" } - $entries->[ 0 ]->{ "CHR_BEG" } + 1,
449 Q_ID => join( ":", @q_ids ),
451 STRAND => $entries->[ 0 ]->{ "STRAND" } || "+",
452 THICK_BEG => $entries->[ 0 ]->{ "THICK_BEG" } || $entries->[ 0 ]->{ "CHR_BEG" },
453 THICK_END => $entries->[ -1 ]->{ "THICK_END" } || $entries->[ -1 ]->{ "CHR_END" },
455 BLOCKCOUNT => scalar @new_q_begs,
456 BLOCKSIZES => join( ",", @new_blocksizes ),
457 Q_BEGS => join( ",", @new_q_begs ),
460 return wantarray ? %new_entry : \%new_entry;
466 # Martin A. Hansen, February 2008.
468 # Splits a given BED entry into a list of blocks,
469 # which are returned. A list of 6 column BED entry is returned.
471 my ( $entry, # BED entry hashref
476 my ( @q_begs, @blocksizes, $block, @blocks, $i );
478 if ( exists $entry->{ "BLOCKCOUNT" } )
480 @q_begs = split ",", $entry->{ "Q_BEGS" };
481 @blocksizes = split ",", $entry->{ "BLOCKSIZES" };
483 for ( $i = 0; $i < @q_begs; $i++ )
487 $block->{ "CHR" } = $entry->{ "CHR" };
488 $block->{ "CHR_BEG" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ];
489 $block->{ "CHR_END" } = $entry->{ "CHR_BEG" } + $q_begs[ $i ] + $blocksizes[ $i ] - 1;
490 $block->{ "Q_ID" } = $entry->{ "Q_ID" } . sprintf( "_%03d", $i );
491 $block->{ "SCORE" } = $entry->{ "SCORE" };
492 $block->{ "STRAND" } = $entry->{ "STRAND" };
493 $block->{ "BED_LEN" } = $block->{ "CHR_END" } - $block->{ "CHR_BEG" } + 1,
494 $block->{ "BED_COLS" } = 6;
495 $block->{ "REC_TYPE" } = "BED";
497 push @blocks, $block;
502 @blocks = @{ $entry };
505 return wantarray ? @blocks : \@blocks;
512 # Martin A. Hansen, February 2008.
514 # Checks if two BED entries overlap and
515 # return 1 if so - else 0;
517 my ( $entry1, # hashref
519 $no_strand, # don't check strand flag - OPTIONAL
524 return 0 if $entry1->{ "CHR" } ne $entry2->{ "CHR" };
525 return 0 if $entry1->{ "STRAND" } ne $entry2->{ "STRAND" };
527 if ( $entry1->{ "CHR_END" } < $entry2->{ "CHR_BEG" } or $entry1->{ "CHR_BEG" } > $entry2->{ "CHR_END" } ) {
535 sub bed_upload_to_ucsc
537 # Martin A. Hansen, September 2007.
539 # Upload a BED file to the UCSC database.
541 my ( $tmp_dir, # temporary directory
542 $file, # file to upload,
543 $options, # argument hashref
544 $append, # flag indicating table should be appended
549 my ( $args, $table, $sql_file, $fh_out, $fh_in );
552 $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", "-oldTable", $file;
554 $args = join " ", $options->{ "database" }, $options->{ "table" }, "-tmpDir=$tmp_dir", $file;
557 if ( $options->{ "sec_struct" } )
559 $table = $options->{ "table" };
561 &Maasha::Common::error( "Attempt to load secondary structure track without 'rnaSecStr' in table name" ) if not $table =~ /rnaSecStr/;
563 $sql_file = "$tmp_dir/upload_RNA_SS.sql";
565 $fh_out = &Maasha::Common::write_open( $sql_file );
568 CREATE TABLE $table (
569 bin smallint not null, # Bin number for browser speedup
570 chrom varchar(255) not null, # Chromosome or FPC contig
571 chromStart int unsigned not null, # Start position in chromosome
572 chromEnd int unsigned not null, # End position in chromosome
573 name varchar(255) not null, # Name of item
574 score int unsigned not null, # Score from 0-1000
575 strand char(1) not null, # + or -
576 size int unsigned not null, # Size of element.
577 secStr longblob not null, # Parentheses and '.'s which define the secondary structure
578 conf longblob not null, # Confidence of secondary-structure annotation per position (0.0-1.0).
581 INDEX(chrom(8), bin),
582 INDEX(chrom(8), chromStart)
588 &Maasha::Common::run( "hgLoadBed", "-notItemRgb -sqlTable=$sql_file $options->{ 'database' } $options->{ 'table' } -tmpDir=$tmp_dir $file > /dev/null 2>&1" );
594 &Maasha::Common::run( "hgLoadBed", "$args > /dev/null 2>&1" );
599 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
604 # Martin A. Hansen, February 2008.
606 # Reads PSL entries and returns a record.
608 my ( $path, # full path to PSL file
613 my ( $fh, @lines, @fields, $i, %record, @records );
615 $fh = &Maasha::Common::read_open( $path );
623 for ( $i = 5; $i < @lines; $i++ )
625 @fields = split "\t", $lines[ $i ];
627 &Maasha::Common::error( qq(Bad PSL format in file "$path") ) if not @fields == 21;
633 MATCHES => $fields[ 0 ],
634 MISMATCHES => $fields[ 1 ],
635 REPMATCHES => $fields[ 2 ],
636 NCOUNT => $fields[ 3 ],
637 QNUMINSERT => $fields[ 4 ],
638 QBASEINSERT => $fields[ 5 ],
639 SNUMINSERT => $fields[ 6 ],
640 SBASEINSERT => $fields[ 7 ],
641 STRAND => $fields[ 8 ],
642 Q_ID => $fields[ 9 ],
643 Q_LEN => $fields[ 10 ],
644 Q_BEG => $fields[ 11 ],
645 Q_END => $fields[ 12 ] - 1,
646 S_ID => $fields[ 13 ],
647 S_LEN => $fields[ 14 ],
648 S_BEG => $fields[ 15 ],
649 S_END => $fields[ 16 ] - 1,
650 BLOCKCOUNT => $fields[ 17 ],
651 BLOCKSIZES => $fields[ 18 ],
652 Q_BEGS => $fields[ 19 ],
653 S_BEGS => $fields[ 20 ],
656 $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
658 push @records, { %record };
661 return wantarray ? @records : \@records;
667 # Martin A. Hansen, September 2007.
669 # Write a PSL header to file.
671 my ( $fh, # file handle - OPTIONAL
676 $fh = \*STDOUT if not $fh;
678 print $fh qq(psLayout version 3
679 match mis- rep. N's Q gap Q gap T gap T gap strand Q Q Q Q T T T T block blockSizes qStart match match count bases count bases name size start end name size start end count
680 ---------------------------------------------------------------------------------------------------------------------------------------------------------------
687 # Martin A. Hansen, September 2007.
689 # Write a PSL entry to file.
691 my ( $record, # hashref
692 $fh, # file handle - OPTIONAL
697 $fh = \*STDOUT if not $fh;
701 push @output, $record->{ "MATCHES" };
702 push @output, $record->{ "MISMATCHES" };
703 push @output, $record->{ "REPMATCHES" };
704 push @output, $record->{ "NCOUNT" };
705 push @output, $record->{ "QNUMINSERT" };
706 push @output, $record->{ "QBASEINSERT" };
707 push @output, $record->{ "SNUMINSERT" };
708 push @output, $record->{ "SBASEINSERT" };
709 push @output, $record->{ "STRAND" };
710 push @output, $record->{ "Q_ID" };
711 push @output, $record->{ "Q_LEN" };
712 push @output, $record->{ "Q_BEG" };
713 push @output, $record->{ "Q_END" } + 1;
714 push @output, $record->{ "S_ID" };
715 push @output, $record->{ "S_LEN" };
716 push @output, $record->{ "S_BEG" };
717 push @output, $record->{ "S_END" } + 1;
718 push @output, $record->{ "BLOCKCOUNT" };
719 push @output, $record->{ "BLOCKSIZES" };
720 push @output, $record->{ "Q_BEGS" };
721 push @output, $record->{ "S_BEGS" };
723 print $fh join( "\t", @output ), "\n";
727 sub psl_upload_to_ucsc
729 # Martin A. Hansen, September 2007.
731 # Upload a PSL file to the UCSC database.
733 my ( $file, # file to upload,
734 $options, # argument hashref
735 $append, # flag indicating table should be appended
743 $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", "-append", $file;
745 $args = join " ", $options->{ "database" }, "-table=$options->{ 'table' }", "-clientLoad", $file;
748 &Maasha::Common::run( "hgLoadPsl", "$args > /dev/null 2>&1" );
752 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TRACK FILE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
757 # Martin A. Hansen, September 2007.
759 # Update the /home/user/ucsc/my_tracks.ra file and executes makeCustomTracks.pl
761 my ( $options, # hashref
767 my ( $file, $fh_in, $fh_out, $line, $time );
769 $file = $ENV{ "HOME" } . "/ucsc/my_tracks.ra";
771 # ---- create a backup ----
773 $fh_in = &Maasha::Common::read_open( $file );
774 $fh_out = &Maasha::Common::write_open( "$file~" );
776 while ( $line = <$fh_in> ) {
783 # ---- append track ----
785 $time = &Maasha::Common::time_stamp();
787 $fh_out = &Maasha::Common::append_open( $file );
789 if ( $type eq "sec_struct" )
791 print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
793 print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
795 print $fh_out "# Database $options->{ 'database' }\n\n";
797 print $fh_out "track $options->{ 'table' }\n";
798 print $fh_out "shortLabel $options->{ 'short_label' }\n";
799 print $fh_out "longLabel $options->{ 'long_label' }\n";
800 print $fh_out "group $options->{ 'group' }\n";
801 print $fh_out "priority $options->{ 'priority' }\n";
802 print $fh_out "visibility $options->{ 'visibility' }\n";
803 print $fh_out "color $options->{ 'color' }\n";
804 print $fh_out "type bed 6 +\n";
805 print $fh_out "mafTrack multiz17way\n";
807 print $fh_out "\n# //\n";
811 print $fh_out "\n\n# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
813 print $fh_out "\n# Track added by 'upload_to_ucsc' $time\n\n";
815 print $fh_out "# Database $options->{ 'database' }\n\n";
817 print $fh_out "track $options->{ 'table' }\n";
818 print $fh_out "shortLabel $options->{ 'short_label' }\n";
819 print $fh_out "longLabel $options->{ 'long_label' }\n";
820 print $fh_out "group $options->{ 'group' }\n";
821 print $fh_out "priority $options->{ 'priority' }\n";
822 print $fh_out "useScore 1\n" if $options->{ 'use_score' };
823 print $fh_out "visibility $options->{ 'visibility' }\n";
824 print $fh_out "maxHeightPixels 50:50:11\n" if $type eq "wig 0";
825 print $fh_out "color $options->{ 'color' }\n";
826 print $fh_out "type $type\n";
828 print $fh_out "\n# //\n";
833 &Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" );
837 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PhastCons format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
840 sub phastcons_get_entry
842 # Martin A. Hansen, December 2007.
844 # Given a file handle to a PhastCons file get the
845 # next entry which is all the lines after a "fixedStep"
846 # line and until the next "fixedStep" line or EOF.
848 my ( $fh, # filehandle
851 # Returns a list of lines
853 my ( $entry, @lines );
855 local $/ = "\nfixedStep ";
861 @lines = split "\n", $entry;
863 return if @lines == 0;
865 $lines[ 0 ] =~ s/fixedStep?\s*//;
867 return wantarray ? @lines : \@lines;
871 sub phastcons_parse_entry
873 # Martin A. Hansen, December 2007.
875 # Given a PhastCons entry converts this to a
876 # list of super blocks.
878 my ( $lines, # list of lines
879 $args, # argument hash
884 my ( $info, $chr, $beg, $step, $i, $c, $j, @blocks, @super_blocks, @entries, $super_block, $block, @lens, @begs );
886 $info = shift @{ $lines };
888 if ( $info =~ /^chrom=([^ ]+) start=(\d+) step=(\d+)$/ )
894 die qq(ERROR: step size $step != 1 -> problem!\n) if $step != 1; # in an ideal world should would be fixed ...
899 while ( $i < @{ $lines } )
901 if ( $lines->[ $i ] >= $args->{ "threshold" } )
905 while ( $c < @{ $lines } )
907 if ( $lines->[ $c ] < $args->{ "threshold" } )
911 while ( $j < @{ $lines } and $lines->[ $j ] < $args->{ "threshold" } ) {
915 if ( $j - $c > $args->{ "gap" } )
917 if ( $c - $i >= $args->{ "min" } )
921 CHR_BEG => $beg + $i - 1,
922 CHR_END => $beg + $c - 2,
940 if ( $c - $i >= $args->{ "min" } )
944 CHR_BEG => $beg + $i - 1,
945 CHR_END => $beg + $c - 2,
960 while ( $i < @blocks )
964 while ( $c < @blocks and $blocks[ $c ]->{ "CHR_BEG" } - $blocks[ $c - 1 ]->{ "CHR_END" } <= $args->{ "dist" } )
969 push @super_blocks, [ @blocks[ $i .. $c - 1 ] ];
974 foreach $super_block ( @super_blocks )
976 foreach $block ( @{ $super_block } )
978 push @begs, $block->{ "CHR_BEG" } - $super_block->[ 0 ]->{ "CHR_BEG" };
979 push @lens, $block->{ "CHR_LEN" } - 1;
985 CHR => $super_block->[ 0 ]->{ "CHR" },
986 CHR_BEG => $super_block->[ 0 ]->{ "CHR_BEG" },
987 CHR_END => $super_block->[ -1 ]->{ "CHR_END" },
991 THICK_BEG => $super_block->[ 0 ]->{ "CHR_BEG" },
992 THICK_END => $super_block->[ -1 ]->{ "CHR_END" } + 1,
993 ITEMRGB => "0,200,100",
994 BLOCKCOUNT => scalar @{ $super_block },
995 BLOCKSIZES => join( ",", @lens ),
996 Q_BEGS => join( ",", @begs ),
1003 return wantarray ? @entries : \@entries;
1007 sub phastcons_index_create
1009 # Martin A. Hansen, January 2008.
1011 # Indexes a concatenated PhastCons file.
1012 # The index consists of a hash with chromosomes as keys,
1013 # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values.
1015 my ( $path, # path to PhastCons file
1020 my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i );
1022 $fh = &Maasha::Common::read_open( $path );
1026 while ( $entry = &Maasha::UCSC::phastcons_get_entry( $fh ) )
1028 $locator = shift @{ $entry };
1030 if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ )
1033 $beg = $2 - 1; # phastcons files are 1-based
1038 &Maasha::Common::error( qq(Could not parse PhastCons locator: $locator) );
1041 $pos += length( $locator ) + 11;
1045 # map { $pos += length( $_ ) + 1 } @{ $entry };
1047 $pos += 6 * scalar @{ $entry };
1049 $index_len = $pos - $index_beg;
1051 push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ];
1056 foreach $chr ( keys %index )
1058 for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) {
1059 $index{ $chr }->[ $i ]->[ NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ];
1062 $index{ $chr }->[ -1 ]->[ NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ CHR_END ] + 1;
1065 return wantarray ? %index : \%index;
1069 sub phastcons_index_store
1071 # Martin A. Hansen, January 2008.
1073 # Writes a PhastCons index to binary file.
1075 my ( $path, # full path to file
1076 $index, # list with index
1081 &Maasha::Common::file_store( $path, $index );
1085 sub phastcons_index_retrieve
1087 # Martin A. Hansen, January 2008.
1089 # Retrieves a PhastCons index from binary file.
1091 my ( $path, # full path to file
1098 $index = &Maasha::Common::file_retrieve( $path );
1100 return wantarray ? %{ $index } : $index;
1104 sub phastcons_index_lookup
1106 # Martin A. Hansen, January 2008.
1108 # Retrieve PhastCons scores from a indexed
1109 # Phastcons file given a chromosome and
1110 # begin and end positions.
1112 my ( $index, # data structure
1113 $fh, # filehandle to datafile
1115 $chr_beg, # chromosome beg
1116 $chr_end, # chromosome end
1117 $flank, # include flanking region - OPTIONAL
1122 my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores );
1129 # print "chr_beg->$chr_beg chr_end->$chr_end flank->$flank\n";
1131 if ( exists $index->{ $chr } )
1133 $index_beg = &Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg );
1135 if ( $index_beg < 0 ) {
1136 &Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) );
1139 if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] )
1141 $index_end = $index_beg;
1145 $index_end = &Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end );
1147 if ( $index_end < 0 ) {
1148 &Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) );
1152 map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg;
1154 if ( $index_beg == $index_end )
1156 $beg = &Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] );
1157 $end = &Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ CHR_END ] );
1159 if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] )
1161 @vals = split "\n", &Maasha::Common::file_read(
1163 $index->{ $chr }->[ $index_beg ]->[ INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ),
1164 6 * ( $end - $beg + 1 ),
1168 for ( $c = 0; $c < @vals; $c++ ) {
1169 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1174 $beg = &Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] );
1176 # print Dumper( $beg, $index->{ $chr }->[ $index_beg ] );
1177 # print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ NEXT_CHR_BEG ] );
1183 if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ CHR_END ] )
1185 @vals = split "\n", &Maasha::Common::file_read(
1187 $index->{ $chr }->[ $index_beg ]->[ INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ CHR_BEG ] ),
1188 6 * ( $index->{ $chr }->[ $index_beg ]->[ CHR_END ] - $beg + 1 ),
1191 for ( $c = 0; $c < @vals; $c++ ) {
1192 $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ];
1196 $end = &Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ CHR_END ] );
1198 if ( $end <= $index->{ $chr }->[ $index_end ]->[ CHR_END ] )
1200 @vals = split "\n", &Maasha::Common::file_read(
1202 $index->{ $chr }->[ $index_end ]->[ INDEX_BEG ],
1203 6 * ( $end - $index->{ $chr }->[ $index_end ]->[ CHR_BEG ] + 1 ),
1206 for ( $c = 0; $c < @vals; $c++ ) {
1207 $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1211 for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ )
1213 @vals = split "\n", &Maasha::Common::file_read(
1215 $index->{ $chr }->[ $i ]->[ INDEX_BEG ],
1216 6 * ( $index->{ $chr }->[ $i ]->[ CHR_END ] - $index->{ $chr }->[ $i ]->[ CHR_BEG ] + 1 ),
1219 for ( $c = 0; $c < @vals; $c++ ) {
1220 $scores->[ $c + $index->{ $chr }->[ $i ]->[ CHR_BEG ] - $chr_beg ] = $vals[ $c ];
1227 &Maasha::Common::error( qq(Chromosome "$chr" was not found in index) );
1230 return wantarray ? @{ $scores } : $scores;
1234 sub phastcons_normalize
1236 # Martin A. Hansen, January 2008.
1238 # Normalizes a list of lists with PhastCons scores,
1239 # in such a way that each list contains the same number
1240 # or PhastCons scores.
1242 my ( $AoA, # AoA with PhastCons scores
1247 my ( $list, $max, $min, $mean, $diff );
1252 foreach $list ( @{ $AoA } )
1254 $min = scalar @{ $list } if scalar @{ $list } < $min;
1255 $max = scalar @{ $list } if scalar @{ $list } > $max;
1258 $mean = int( ( $min + $max ) / 2 );
1260 # print STDERR "min->$min max->$max mean->$mean\n";
1262 foreach $list ( @{ $AoA } )
1264 $diff = scalar @{ $list } - $mean;
1266 &phastcons_list_inflate( $list, abs( $diff ) ) if $diff < 0;
1267 &phastcons_list_deflate( $list, $diff ) if $diff > 0;
1270 return wantarray ? @{ $AoA } : $AoA;
1274 sub phastcons_list_inflate
1276 # Martin A. Hansen, January 2008.
1278 # Inflates a list with a given number of elements
1279 # in such a way that the extra elements are introduced
1280 # evenly over the entire length of the list. The value
1281 # of the extra elements is based on a mean of the
1282 # adjacent elements.
1284 my ( $list, # list of elements
1285 $diff, # number of elements to introduce
1290 my ( $len, $space, $i, $pos );
1292 $len = scalar @{ $list };
1294 $space = $len / $diff;
1296 for ( $i = 0; $i < $diff; $i++ )
1298 $pos = int( ( $space / 2 ) + $i * $space );
1300 splice @{ $list }, $pos, 0, ( $list->[ $pos - 1 ] + $list->[ $pos + 1 ] ) / 2;
1301 # splice @{ $list }, $pos, 0, "X";
1304 die qq(ERROR: bad inflate\n) if scalar @{ $list } != $len + $diff;
1308 sub phastcons_list_deflate
1310 # Martin A. Hansen, January 2008.
1312 # Deflates a list by removing a given number of elements
1313 # evenly distributed over the entire list.
1315 my ( $list, # list of elements
1316 $diff, # number of elements to remove
1321 my ( $len, $space, $i, $pos );
1323 $len = scalar @{ $list };
1325 $space = ( $len - $diff ) / $diff;
1327 for ( $i = 0; $i < $diff; $i++ )
1329 $pos = int( ( $space / 2 ) + $i * $space );
1331 splice @{ $list }, $pos, 1;
1334 die qq(ERROR: bad deflate\n) if scalar @{ $list } != $len - $diff;
1340 # Martin A. Hansen, January 2008.
1342 # Given a normalized PhastCons matrix in an AoA,
1343 # calculate the mean for each column and return as a list.
1345 my ( $AoA, # AoA with normalized PhastCons scores
1352 $AoA = &Maasha::Matrix::matrix_flip( $AoA );
1354 map { push @list, &Maasha::Calc::mean( $_ ) } @{ $AoA };
1356 return wantarray ? @list : \@list;
1360 sub phastcons_median
1362 # Martin A. Hansen, January 2008.
1364 # Given a normalized PhastCons matrix in an AoA,
1365 # calculate the median for each column and return as a list.
1367 my ( $AoA, # AoA with normalized PhastCons scores
1374 $AoA = &Maasha::Matrix::matrix_flip( $AoA );
1376 map { push @list, &Maasha::Calc::median( $_ ) } @{ $AoA };
1378 return wantarray ? @list : \@list;
1382 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MULTIPLE ALIGNMENT FILES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1387 # Martin A. Hansen, April 2008.
1389 # Executes mafFrag to extract a subalignment from a multiz track
1390 # in the UCSC genome browser database.
1392 my ( $tmp_dir, # temporary directory
1393 $database, # genome database
1394 $table, # table with the multiz track
1396 $beg, # begin position
1397 $end, # end position
1401 # Returns a list of record
1403 my ( $tmp_file, $align );
1405 $tmp_file = "$tmp_dir/maf_extract.maf";
1407 &Maasha::Common::run( "mafFrag", "$database $table $chr $beg $end $strand $tmp_file" );
1409 $align = &maf_parse( $tmp_file );
1413 return wantarray ? @{ $align } : $align;
1419 # Martin A. Hansen, April 2008.
1422 my ( $path, # full path to MAF file
1425 # Returns a list of record.
1427 my ( $fh, $line, @fields, @align );
1429 $fh = &Maasha::Common::read_open( $path );
1431 while ( $line = <$fh> )
1435 if ( $line =~ /^s/ )
1437 @fields = split / /, $line;
1440 SEQ_NAME => $fields[ 1 ],
1441 SEQ => $fields[ -1 ],
1443 ALIGN_LEN => length $fields[ -1 ],
1450 return wantarray ? @align : \@align;
1454 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> WIGGLE FORMAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1457 sub fixedstep_put_entry
1459 # Martin A. Hansen, April 2008.
1461 # Outputs a block of fixedStep values.
1462 # Used for outputting wiggle data.
1464 my ( $chr, # chromosome
1465 $beg, # start position
1466 $block, # list of scores
1467 $fh, # filehandle - OPTIONAL
1472 $beg += 1; # fixedStep format is 1 based.
1476 print $fh "fixedStep chrom=$chr start=$beg step=1\n";
1478 map { printf( $fh "%d\n", ( $_ + 1 ) ) } @{ $block };
1482 print "fixedStep chrom=$chr start=$beg step=1\n";
1484 map { printf( "%d\n", ( $_ + 1 ) ) } @{ $block };
1489 sub wiggle_upload_to_ucsc
1491 # Martin A. Hansen, May 2008.
1493 # Upload a wiggle file to the UCSC database.
1495 my ( $tmp_dir, # temporary directory
1496 $wib_dir, # wib directory
1497 $wig_file, # file to upload,
1498 $options, # argument hashref
1505 # $args = join " ", "-tmpDir=$tmp_dir", "-pathPrefix=$wib_dir", $options->{ "database" }, $options->{ 'table' }, $wig_file;
1507 # &Maasha::Common::run( "hgLoadWiggle", "$args > /dev/null 2>&1" );
1509 `cd $tmp_dir && hgLoadWiggle -tmpDir=$tmp_dir -pathPrefix=$wib_dir $options->{ 'database' } $options->{ 'table' } $wig_file > /dev/null 2>&1`;
1513 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MySQL CONF <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1518 # Martin A. Hansen, May 2008
1520 # Fetches the MySQL database user name from the
1521 # .hg.conf file in the users home directory.
1525 my ( $fh, $line, $user );
1527 $fh = &Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1529 while ( $line = <$fh> )
1533 if ( $line =~ /^db\.user=(.+)/ )
1547 sub ucsc_get_password
1549 # Martin A. Hansen, May 2008
1551 # Fetches the MySQL database password from the
1552 # .hg.conf file in the users home directory.
1556 my ( $fh, $line, $password );
1558 $fh = &Maasha::Common::read_open( "$ENV{ 'HOME' }/.hg.conf" );
1560 while ( $line = <$fh> )
1564 if ( $line =~ /^db\.password=(.+)/ )
1578 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<