1 package Maasha::UCSC::BED;
3 # Copyright (C) 2007-2008 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for interaction with Browser Extensible DATA (BED) entries and files.
27 # Read about the BED format here: http://genome.ucsc.edu/FAQ/FAQformat#format1
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
40 use vars qw( @ISA @EXPORT_OK );
44 @ISA = qw( Exporter );
50 chrom => 0, # BED field names
62 CHR => 0, # Biopieces field names
77 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
80 # Hash for converting BED keys to Biopieces keys.
84 chromStart => "CHR_BEG",
85 chromEnd => "CHR_END",
89 thickStart => "THICK_BEG",
90 thickEnd => "THICK_END",
92 blockCount => "BLOCK_COUNT",
93 blockSizes => "BLOCK_LENS",
94 blockStarts => "Q_BEGS",
98 # Hash for converting biopieces keys to BED keys.
100 my %BIOPIECES2BED = (
102 CHR_BEG => "chromStart",
103 CHR_END => "chromEnd",
107 THICK_BEG => "thickStart",
108 THICK_END => "thickEnd",
110 BLOCK_COUNT => "blockCount",
111 BLOCK_LENS => "blockSizes",
112 Q_BEGS => "blockStarts",
118 # Martin A. Hansen, September 2008.
120 # Reads a BED entry given a filehandle.
122 my ( $fh, # file handle
123 $cols, # columns to read - OPTIONAL (3,4,5,6,8,9 or 12)
124 $check, # check integrity of BED values - OPTIONAL
129 my ( $line, %entry );
133 $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
135 return if not defined $line;
137 if ( not defined $cols ) {
138 $cols = 1 + $line =~ tr/\t//;
145 $entry{ 'chromStart' },
147 ) = split "\t", $line, $cols + 1;
153 $entry{ 'chromStart' },
154 $entry{ 'chromEnd' },
156 ) = split "\t", $line, $cols + 1;
162 $entry{ 'chromStart' },
163 $entry{ 'chromEnd' },
166 ) = split "\t", $line, $cols + 1;
172 $entry{ 'chromStart' },
173 $entry{ 'chromEnd' },
177 ) = split "\t", $line, $cols + 1;
183 $entry{ 'chromStart' },
184 $entry{ 'chromEnd' },
188 $entry{ 'thickStart' },
189 $entry{ 'thickEnd' },
190 ) = split "\t", $line, $cols + 1;
196 $entry{ 'chromStart' },
197 $entry{ 'chromEnd' },
201 $entry{ 'thickStart' },
202 $entry{ 'thickEnd' },
204 ) = split "\t", $line, $cols + 1;
206 elsif ( $cols == 12 )
210 $entry{ 'chromStart' },
211 $entry{ 'chromEnd' },
215 $entry{ 'thickStart' },
216 $entry{ 'thickEnd' },
218 $entry{ 'blockCount' },
219 $entry{ 'blockSizes' },
220 $entry{ 'blockStarts' }
221 ) = split "\t", $line, $cols + 1;
225 Maasha::Common::error( qq(Bad BED entry column count: $cols) );
228 bed_entry_check( \%entry ) if $check;
230 return wantarray ? %entry : \%entry;
234 sub bed_entry_get_list
236 # Martin A. Hansen, September 2008.
238 # Reads a BED entry given a filehandle.
240 my ( $fh, # file handle
241 $cols, # columns to read - OPTIONAL (3,4,5,6,8,9 or 12)
242 $check, # check integrity of BED values - OPTIONAL
247 my ( $line, @entry );
251 $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
253 return if not defined $line;
255 if ( not defined $cols ) {
256 $cols = 1 + $line =~ tr/\t//;
259 @entry = split "\t", $line, $cols + 1;
261 pop @entry if scalar @entry > $cols;
263 bed_entry_check( \@entry ) if $check;
265 return wantarray ? @entry : \@entry;
271 # Martin A. Hansen, September 2008.
273 # Writes a BED entry array to file.
276 $fh, # file handle - OPTIONAL
277 $cols, # number of columns in BED file - OPTIONAL (3,4,5,6,8,9 or 12)
278 $check, # check integrity of BED values - OPTIONAL
283 $cols = scalar keys %{ $entry } if not $cols;
285 bed_entry_check( $entry ) if $check;
287 $fh = \*STDOUT if not defined $fh;
291 print $fh join( "\t",
293 $entry->{ 'chromStart' },
294 $entry->{ 'chromEnd' },
299 print $fh join( "\t",
301 $entry->{ 'chromStart' },
302 $entry->{ 'chromEnd' },
308 print $fh join( "\t",
310 $entry->{ 'chromStart' },
311 $entry->{ 'chromEnd' },
318 print $fh join( "\t",
320 $entry->{ 'chromStart' },
321 $entry->{ 'chromEnd' },
324 $entry->{ 'strand' },
329 print $fh join( "\t",
331 $entry->{ 'chromStart' },
332 $entry->{ 'chromEnd' },
335 $entry->{ 'strand' },
336 $entry->{ 'thickStart' },
337 $entry->{ 'thickEnd' },
342 print $fh join( "\t",
344 $entry->{ 'chromStart' },
345 $entry->{ 'chromEnd' },
348 $entry->{ 'strand' },
349 $entry->{ 'thickStart' },
350 $entry->{ 'thickEnd' },
351 $entry->{ 'itemRgb' },
354 elsif ( $cols == 12 )
356 print $fh join( "\t",
358 $entry->{ 'chromStart' },
359 $entry->{ 'chromEnd' },
362 $entry->{ 'strand' },
363 $entry->{ 'thickStart' },
364 $entry->{ 'thickEnd' },
365 $entry->{ 'itemRgb' },
366 $entry->{ 'blockCount' },
367 $entry->{ 'blockSizes' },
368 $entry->{ 'blockStarts' },
373 Maasha::Common::error( qq(Bad BED entry column count: $cols) );
380 # Martin A. Hansen, November 2008.
382 # Checks a BED entry for integrity and
383 # raises an error if there is a problem.
385 my ( $bed, # hash ref
390 my ( $cols, @block_sizes, @block_starts );
392 $cols = scalar keys %{ $bed };
395 Maasha::Common::error( qq(Bad BED entry - must contain at least 3 columns - not $cols) );
399 Maasha::Common::error( qq(Bad BED entry - must contains no more than 12 columns - not $cols) );
402 if ( $bed->{ 'chrom' } =~ /\s/ ) {
403 Maasha::Common::error( qq(Bad BED entry - no white space allowed in chrom field: "$bed->{ 'chrom' }") );
406 if ( $bed->{ 'chromStart' } =~ /\D/ ) {
407 Maasha::Common::error( qq(Bad BED entry - chromStart must be a whole number - not "$bed->{ 'chromStart' }") );
410 if ( $bed->{ 'chromEnd' } =~ /\D/ ) {
411 Maasha::Common::error( qq(Bad BED entry - chromEnd must be a whole number - not "$bed->{ 'chromEnd' }") );
414 if ( $bed->{ 'chromEnd' } < $bed->{ 'chromStart' } ) {
415 Maasha::Common::error( qq(Bad BED entry - chromEnd must be greater than chromStart - not "$bed->{ 'chromStart' } > $bed->{ 'chromEnd' }") );
418 return if $cols == 3;
420 if ( $bed->{ 'name' } =~ /\s/ ) {
421 Maasha::Common::error( qq(Bad BED entry - no white space allowed in name field: "$bed->{ 'name' }") );
424 return if $cols == 4;
426 if ( $bed->{ 'score' } =~ /\D/ ) {
427 Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->{ 'score' }") );
430 # if ( $bed->{ 'score' } < 0 or $bed->{ 'score' } > 1000 ) { # disabled - too restrictive !
431 if ( $bed->{ 'score' } < 0 ) {
432 Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->{ 'score' }") );
435 return if $cols == 5;
437 if ( $bed->{ 'strand' } ne '+' and $bed->{ 'strand' } ne '-' ) {
438 Maasha::Common::error( qq(Bad BED entry - strand must be + or - not "$bed->{ 'strand' }") );
441 return if $cols == 6;
443 if ( $bed->{ 'thickStart' } =~ /\D/ ) {
444 Maasha::Common::error( qq(Bad BED entry - thickStart must be a whole number - not "$bed->{ 'thickStart' }") );
447 if ( $bed->{ 'thickEnd' } =~ /\D/ ) {
448 Maasha::Common::error( qq(Bad BED entry - thickEnd must be a whole number - not "$bed->{ 'thickEnd' }") );
451 if ( $bed->{ 'thickEnd' } < $bed->{ 'thickStart' } ) {
452 Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than thickStart - not "$bed->{ 'thickStart' } > $bed->{ 'thickEnd' }") );
455 if ( $bed->{ 'thickStart' } < $bed->{ 'chromStart' } ) {
456 Maasha::Common::error( qq(Bad BED entry - thickStart must be greater than chromStart - not "$bed->{ 'thickStart' } < $bed->{ 'chromStart' }") );
459 if ( $bed->{ 'thickStart' } > $bed->{ 'chromEnd' } ) {
460 Maasha::Common::error( qq(Bad BED entry - thickStart must be less than chromEnd - not "$bed->{ 'thickStart' } > $bed->{ 'chromEnd' }") );
463 if ( $bed->{ 'thickEnd' } < $bed->{ 'chromStart' } ) {
464 Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than chromStart - not "$bed->{ 'thickEnd' } < $bed->{ 'chromStart' }") );
467 if ( $bed->{ 'thickEnd' } > $bed->{ 'chromEnd' } ) {
468 Maasha::Common::error( qq(Bad BED entry - thickEnd must be less than chromEnd - not "$bed->{ 'thickEnd' } > $bed->{ 'chromEnd' }") );
471 return if $cols == 8;
473 if ( $bed->{ 'itemRgb' } !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) {
474 Maasha::Common::error( qq(Bad BED entry - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->{ 'itemRgb' }") );
477 return if $cols == 9;
479 if ( $bed->{ 'blockCount' } =~ /\D/ ) {
480 Maasha::Common::error( qq(Bad BED entry - blockCount must be a whole number - not "$bed->{ 'blockCount' }") );
483 @block_sizes = split ",", $bed->{ 'blockSizes' };
485 if ( grep /\D/, @block_sizes ) {
486 Maasha::Common::error( qq(Bad BED entry - blockSizes must be whole numbers - not "$bed->{ 'blockSizes' }") );
489 if ( $bed->{ 'blockCount' } != scalar @block_sizes ) {
490 Maasha::Common::error( qq(Bad BED entry - blockSizes "$bed->{ 'blockSizes' }" must match blockCount "$bed->{ 'blockCount' }") );
493 @block_starts = split ",", $bed->{ 'blockStarts' };
495 if ( grep /\D/, @block_starts ) {
496 Maasha::Common::error( qq(Bad BED entry - blockStarts must be whole numbers - not "$bed->{ 'blockStarts' }") );
499 if ( $bed->{ 'blockCount' } != scalar @block_starts ) {
500 Maasha::Common::error( qq(Bad BED entry - blockStarts "$bed->{ 'blockStarts' }" must match blockCount "$bed->{ 'blockCount' }") );
503 if ( $bed->{ 'chromStart' } + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->{ 'chromEnd' } ) {
504 Maasha::Common::error( qq(Bad BED entry - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) .
505 qq($bed->{ 'chromStart' } + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->{ 'chromEnd' }) );
512 # Martin A. hansen, November 2008.
514 # Sorts a given BED file according to a given
516 # 1: chr AND chr_beg.
517 # 2: chr AND strand AND chr_beg.
519 # 4: strand AND chr_beg.
521 # Currently 'bed_sort' is used for sorting = a standalone c program
522 # that uses qsort (unstable sort).
524 my ( $file_in, # path to BED file.
525 $scheme, # sort scheme
526 $cols, # number of columns in BED file
533 $file_out = "$file_in.sort";
535 Maasha::Common::run( "bed_sort", "--sort $scheme --cols $cols $file_in > $file_out" );
537 if ( Maasha::Filesys::file_size( $file_in ) != Maasha::Filesys::file_size( $file_out ) ) {
538 Maasha::Common::error( qq(bed_sort of file "$file_in" failed) );
541 rename $file_out, $file_in;
545 sub bed_file_split_on_chr
547 # Martin A. Hansen, November 2008.
549 # Given a path to a BED file splits
550 # this file into one file per chromosome
551 # in a temporary directory. Returns a hash
552 # with chromosome name as key and the corresponding
555 my ( $file, # path to BED file
556 $dir, # working directory
557 $cols, # number of BED columns to read - OPTIONAL
562 my ( $fh_in, $fh_out, $entry, %fh_hash, %file_hash );
564 $fh_in = Maasha::Filesys::file_read_open( $file );
566 while ( $entry = bed_entry_get( $fh_in, $cols ) )
568 if ( not exists $file_hash{ $entry->[ chrom ] } )
570 $fh_hash{ $entry->[ chrom ] } = Maasha::Filesys::file_write_open( "$dir/$entry->[ chrom ]" );
571 $file_hash{ $entry->[ chrom ] } = "$dir/$entry->[ chrom ]";
574 $fh_out = $fh_hash{ $entry->[ chrom ] };
576 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $cols );
579 map { close $_ } keys %fh_hash;
581 return wantarray ? %file_hash : \%file_hash;
585 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BIOPIECES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
590 # Martin A. Hansen, November 2008.
592 # Converts a BED entry given as a hashref
593 # to a Biopiece record which is returned as
596 # IMPORTANT! The BED_END and THICK_END positions
597 # will be the exact position in contrary to the
600 my ( $bed_entry, # BED entry as hashref
605 my ( $cols, %bp_record );
607 $cols = scalar keys %{ $bed_entry };
609 if ( not exists $bed_entry->{ 'chrom' } and
610 not exists $bed_entry->{ 'chromStart' } and
611 not exists $bed_entry->{ 'chromEnd' } )
616 $bp_record{ 'REC_TYPE' } = 'BED';
617 $bp_record{ 'BED_COLS' } = $cols;
618 $bp_record{ 'CHR' } = $bed_entry->{ 'chrom' };
619 $bp_record{ 'CHR_BEG' } = $bed_entry->{ 'chromStart' };
620 $bp_record{ 'CHR_END' } = $bed_entry->{ 'chromEnd' } - 1;
621 $bp_record{ 'BED_LEN' } = $bed_entry->{ 'chromEnd' } - $bed_entry->{ 'chromStart' };
623 return wantarray ? %bp_record : \%bp_record if $cols == 3;
625 $bp_record{ 'Q_ID' } = $bed_entry->{ 'name' };
627 return wantarray ? %bp_record : \%bp_record if $cols == 4;
629 $bp_record{ 'SCORE' } = $bed_entry->{ 'score' };
631 return wantarray ? %bp_record : \%bp_record if $cols == 5;
633 $bp_record{ 'STRAND' } = $bed_entry->{ 'strand' };
635 return wantarray ? %bp_record : \%bp_record if $cols == 6;
637 $bp_record{ 'THICK_BEG' } = $bed_entry->{ 'thickStart' };
638 $bp_record{ 'THICK_END' } = $bed_entry->{ 'thickEnd' } - 1;
640 return wantarray ? %bp_record : \%bp_record if $cols == 8;
642 $bp_record{ 'COLOR' } = $bed_entry->{ 'itemRgb' };
644 return wantarray ? %bp_record : \%bp_record if $cols == 9;
646 $bp_record{ 'BLOCK_COUNT' } = $bed_entry->{ 'blockCount' };
647 $bp_record{ 'BLOCK_LENS' } = $bed_entry->{ 'blockSizes' };
648 $bp_record{ 'Q_BEGS' } = $bed_entry->{ 'blockStarts' };
650 return wantarray ? %bp_record : \%bp_record;
656 # Martin A. Hansen, November 2008.
658 # Converts a Biopieces record given as a hashref
659 # to a BED record which is returned as
660 # an arrayref. As much as possible of the Biopiece
661 # record is converted and undef is returned if
664 # IMPORTANT! The BED_END and THICK_END positions
665 # will be the inexact position used in the
668 my ( $bp_record, # hashref
669 $cols, # number of columns in BED entry - OPTIONAL (but faster)
676 $cols ||= 12; # max number of columns possible
678 $bed_entry{ 'chrom' } = $bp_record->{ 'CHR' } ||
679 $bp_record->{ 'S_ID' } ||
682 if ( exists $bp_record->{ 'CHR_BEG' } ) {
683 $bed_entry{ 'chromStart' } = $bp_record->{ 'CHR_BEG' };
684 } elsif ( exists $bp_record->{ 'S_BEG' } ) {
685 $bed_entry{ 'chromStart' } = $bp_record->{ 'S_BEG' };
690 if ( exists $bp_record->{ 'CHR_END' } ) {
691 $bed_entry{ 'chromEnd' } = $bp_record->{ 'CHR_END' };
692 } elsif ( exists $bp_record->{ 'S_END' }) {
693 $bed_entry{ 'chromEnd' } = $bp_record->{ 'S_END' };
698 $bed_entry{ 'chromEnd' }++;
700 return wantarray ? %bed_entry : \%bed_entry if $cols == 3;
702 $bed_entry{ 'name' } = $bp_record->{ 'Q_ID' } || return wantarray ? %bed_entry : \%bed_entry;
704 return wantarray ? %bed_entry : \%bed_entry if $cols == 4;
706 if ( exists $bp_record->{ 'SCORE' } ) {
707 $bed_entry{ 'score' } = $bp_record->{ 'SCORE' };
709 return wantarray ? %bed_entry : \%bed_entry;
712 return wantarray ? %bed_entry : \%bed_entry if $cols == 5;
714 if ( exists $bp_record->{ 'STRAND' } ) {
715 $bed_entry{ 'strand' } = $bp_record->{ 'STRAND' };
717 return wantarray ? %bed_entry : \%bed_entry;
720 return wantarray ? %bed_entry : \%bed_entry if $cols == 6;
722 if ( defined $bp_record->{ 'THICK_BEG' } and
723 defined $bp_record->{ 'THICK_END' } )
725 $bed_entry{ 'thickStart' } = $bp_record->{ 'THICK_BEG' };
726 $bed_entry{ 'thickEnd' } = $bp_record->{ 'THICK_END' };
729 return wantarray ? %bed_entry : \%bed_entry if $cols == 8;
731 if ( exists $bp_record->{ 'COLOR' } ) {
732 $bed_entry{ 'itemRgb' } = $bp_record->{ 'COLOR' };
734 return wantarray ? %bed_entry : \%bed_entry;
737 return wantarray ? %bed_entry : \%bed_entry if $cols == 9;
739 if ( defined $bp_record->{ 'BLOCK_COUNT' } and
740 defined $bp_record->{ 'BLOCK_LENS' } and
741 defined $bp_record->{ 'Q_BEGS' } )
743 $bed_entry{ 'blockCount' } = $bp_record->{ 'BLOCK_COUNT' };
744 $bed_entry{ 'blockSizes' } = $bp_record->{ 'BLOCK_LENS' };
745 $bed_entry{ 'blockStarts' } = $bp_record->{ 'Q_BEGS' };
746 $bed_entry{ 'thickEnd' }++;
749 return wantarray ? %bed_entry : \%bed_entry;
753 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TAG CONTIGS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
756 sub tag_contigs_assemble
758 # Martin A. Hansen, November 2008.
760 # Returns a path to a BED file with tab contigs.
762 my ( $bed_file, # path to BED file
765 $dir, # working directory
770 my ( $fh_in, $fh_out, $array, $new_file, $pos, $id, $beg, $end, $score, $entry );
772 $fh_in = Maasha::Filesys::file_read_open( $bed_file );
774 $array = tag_contigs_assemble_array( $fh_in );
778 $new_file = "$bed_file.tag_contigs";
780 $fh_out = Maasha::Filesys::file_write_open( $new_file );
785 while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg )
787 $entry->[ chrom ] = $chr;
788 $entry->[ chromStart ] = $beg;
789 $entry->[ chromEnd ] = $end;
790 $entry->[ name ] = sprintf( "TC%06d", $id );
791 $entry->[ score ] = $score;
792 $entry->[ strand ] = $strand;
794 bed_entry_put( $entry, $fh_out );
806 sub tag_contigs_assemble_array
808 # Martin A. Hansen, November 2008.
810 # Given a BED file with entries from only one
811 # chromosome assembles tag contigs from these
812 # ignoring strand information. Only tags with
813 # a score higher than the clone count over
814 # genomic loci (the score field) is included
815 # in the tag contigs.
821 # ======================== tag contig
824 my ( $fh, # file handle to BED file
827 # Returns an arrayref.
829 my ( $entry, $clones, $score, @array );
831 while ( $entry = bed_entry_get( $fh ) )
833 if ( $entry->[ name ] =~ /_(\d+)$/ )
837 if ( $entry->[ score ] )
839 $score = int( $clones / $entry->[ score ] );
841 map { $array[ $_ ] += $score } $entry->[ chromStart ] .. $entry->[ chromEnd ] - 1 if $score >= 1;
846 return wantarray ? @array : \@array;
852 # Martin A. Hansen, November 2008.
854 # Scans an array with tag contigs and locates
855 # the next contig from a given position. The
856 # score of the tag contig is determined as the
857 # maximum value of the tag contig. If a tag contig
858 # is found a triple is returned with beg, end and score
859 # otherwise an empty list is returned.
861 my ( $array, # array to scan
862 $beg, # position to start scanning from
865 # Returns an arrayref.
871 while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ }
875 while ( $array->[ $end ] )
877 $score = Maasha::Calc::max( $score, $array->[ $end ] );
883 return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ];
885 return wantarray ? () : [];
890 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<