1 package Maasha::UCSC::BED;
3 # Copyright (C) 2007-2008 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for interaction with Browser Extensible DATA (BED) entries and files.
27 # Read about the BED format here: http://genome.ucsc.edu/FAQ/FAQformat#format1
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
41 use vars qw( @ISA @EXPORT_OK );
45 @ISA = qw( Exporter );
51 chrom => 0, # BED field names
63 CHR => 0, # Biopieces field names
78 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
81 # Hash for converting BED keys to Biopieces keys.
85 chromStart => "CHR_BEG",
86 chromEnd => "CHR_END",
90 thickStart => "THICK_BEG",
91 thickEnd => "THICK_END",
93 blockCount => "BLOCK_COUNT",
94 blockSizes => "BLOCK_LENS",
95 blockStarts => "Q_BEGS",
99 # Hash for converting biopieces keys to BED keys.
101 my %BIOPIECES2BED = (
103 CHR_BEG => "chromStart",
104 CHR_END => "chromEnd",
108 THICK_BEG => "thickStart",
109 THICK_END => "thickEnd",
111 BLOCK_COUNT => "blockCount",
112 BLOCK_LENS => "blockSizes",
113 Q_BEGS => "blockStarts",
119 # Martin A. Hansen, September 2008.
121 # Reads a BED entry given a filehandle.
123 my ( $fh, # file handle
124 $cols, # columns to read - OPTIONAL (3,4,5,6,8,9 or 12)
125 $check, # check integrity of BED values - OPTIONAL
130 my ( $line, @entry );
134 return if not defined $line;
136 $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
138 if ( not defined $cols ) {
139 $cols = 1 + $line =~ tr/\t//;
142 @entry = split "\t", $line, $cols + 1;
144 pop @entry if scalar @entry > $cols;
146 bed_entry_check( \@entry ) if $check;
148 return wantarray ? @entry : \@entry;
154 # Martin A. Hansen, September 2008.
156 # Writes a BED entry array to file.
159 $fh, # file handle - OPTIONAL
160 $cols, # number of columns in BED file - OPTIONAL (3,4,5,6,8,9 or 12)
161 $check, # check integrity of BED values - OPTIONAL
166 if ( $cols and $cols < scalar @{ $entry } ) {
167 @{ $entry } = @{ $entry }[ 0 .. $cols - 1 ];
170 bed_entry_check( $entry ) if $check;
172 $fh = \*STDOUT if not defined $fh;
174 print $fh join( "\t", @{ $entry } ), "\n";
181 # Martin A. Hansen, November 2008.
183 # Checks a BED entry for integrity and
184 # raises an error if there is a problem.
186 my ( $bed, # array ref
191 my ( $cols, @block_sizes, @block_starts );
193 $cols = scalar @{ $bed };
196 Maasha::Common::error( qq(Bad BED entry - must contain at least 3 columns - not $cols) );
200 Maasha::Common::error( qq(Bad BED entry - must contains no more than 12 columns - not $cols) );
203 if ( $bed->[ chrom ] =~ /\s/ ) {
204 Maasha::Common::error( qq(Bad BED entry - no white space allowed in chrom field: "$bed->[ chrom ]") );
207 if ( $bed->[ chromStart ] =~ /\D/ ) {
208 Maasha::Common::error( qq(Bad BED entry - chromStart must be a whole number - not "$bed->[ chromStart ]") );
211 if ( $bed->[ chromEnd ] =~ /\D/ ) {
212 Maasha::Common::error( qq(Bad BED entry - chromEnd must be a whole number - not "$bed->[ chromEnd ]") );
215 if ( $bed->[ chromEnd ] < $bed->[ chromStart ] ) {
216 Maasha::Common::error( qq(Bad BED entry - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") );
219 return if $cols == 3;
221 if ( $bed->[ name ] =~ /\s/ ) {
222 Maasha::Common::error( qq(Bad BED entry - no white space allowed in name field: "$bed->[ name ]") );
225 return if $cols == 4;
227 if ( $bed->[ score ] =~ /\D/ ) {
228 Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->[ score ]") );
231 # if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) { # disabled - too restrictive !
232 if ( $bed->[ score ] < 0 ) {
233 Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->[ score ]") );
236 return if $cols == 5;
238 if ( $bed->[ strand ] ne '+' and $bed->[ strand ] ne '-' ) {
239 Maasha::Common::error( qq(Bad BED entry - strand must be + or - not "$bed->[ strand ]") );
242 return if $cols == 6;
244 if ( $bed->[ thickStart ] =~ /\D/ ) {
245 Maasha::Common::error( qq(Bad BED entry - thickStart must be a whole number - not "$bed->[ thickStart ]") );
248 if ( $bed->[ thickEnd ] =~ /\D/ ) {
249 Maasha::Common::error( qq(Bad BED entry - thickEnd must be a whole number - not "$bed->[ thickEnd ]") );
252 if ( $bed->[ thickEnd ] < $bed->[ thickStart ] ) {
253 Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") );
256 if ( $bed->[ thickStart ] < $bed->[ chromStart ] ) {
257 Maasha::Common::error( qq(Bad BED entry - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") );
260 if ( $bed->[ thickStart ] > $bed->[ chromEnd ] ) {
261 Maasha::Common::error( qq(Bad BED entry - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") );
264 if ( $bed->[ thickEnd ] < $bed->[ chromStart ] ) {
265 Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") );
268 if ( $bed->[ thickEnd ] > $bed->[ chromEnd ] ) {
269 Maasha::Common::error( qq(Bad BED entry - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") );
272 return if $cols == 8;
274 if ( $bed->[ itemRgb ] !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) {
275 Maasha::Common::error( qq(Bad BED entry - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") );
278 return if $cols == 9;
280 if ( $bed->[ blockCount ] =~ /\D/ ) {
281 Maasha::Common::error( qq(Bad BED entry - blockCount must be a whole number - not "$bed->[ blockCount ]") );
284 @block_sizes = split ",", $bed->[ blockSizes ];
286 if ( grep /\D/, @block_sizes ) {
287 Maasha::Common::error( qq(Bad BED entry - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") );
290 if ( $bed->[ blockCount ] != scalar @block_sizes ) {
291 Maasha::Common::error( qq(Bad BED entry - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") );
294 @block_starts = split ",", $bed->[ blockStarts ];
296 if ( grep /\D/, @block_starts ) {
297 Maasha::Common::error( qq(Bad BED entry - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") );
300 if ( $bed->[ blockCount ] != scalar @block_starts ) {
301 Maasha::Common::error( qq(Bad BED entry - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") );
304 if ( $bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ] ) {
305 Maasha::Common::error( qq(Bad BED entry - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) .
306 qq($bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ]) );
313 # Martin A. hansen, November 2008.
315 # Sorts a given BED file according to a given
317 # 1: chr AND chr_beg.
318 # 2: chr AND strand AND chr_beg.
320 # 4: strand AND chr_beg.
322 # Currently 'bed_sort' is used for sorting = a standalone c program
323 # that uses qsort (unstable sort).
325 my ( $file_in, # path to BED file.
326 $scheme, # sort scheme
327 $cols, # number of columns in BED file
334 $file_out = "$file_in.sort";
336 Maasha::Common::run( "bed_sort", "--sort $scheme --cols $cols $file_in > $file_out" );
338 if ( Maasha::Filesys::file_size( $file_in ) != Maasha::Filesys::file_size( $file_out ) ) {
339 Maasha::Common::error( qq(bed_sort of file "$file_in" failed) );
342 rename $file_out, $file_in;
346 sub bed_file_split_on_chr
348 # Martin A. Hansen, November 2008.
350 # Given a path to a BED file splits
351 # this file into one file per chromosome
352 # in a temporary directory. Returns a hash
353 # with chromosome name as key and the corresponding
356 my ( $file, # path to BED file
357 $dir, # working directory
358 $cols, # number of BED columns to read - OPTIONAL
363 my ( $fh_in, $fh_out, $entry, %fh_hash, %file_hash );
365 $fh_in = Maasha::Filesys::file_read_open( $file );
367 while ( $entry = bed_entry_get( $fh_in, $cols ) )
369 if ( not exists $file_hash{ $entry->[ chrom ] } )
371 $fh_hash{ $entry->[ chrom ] } = Maasha::Filesys::file_write_open( "$dir/$entry->[ chrom ]" );
372 $file_hash{ $entry->[ chrom ] } = "$dir/$entry->[ chrom ]";
375 $fh_out = $fh_hash{ $entry->[ chrom ] };
377 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $cols );
380 map { close $_ } keys %fh_hash;
382 return wantarray ? %file_hash : \%file_hash;
386 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BIOPIECES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
391 # Martin A. Hansen, November 2008.
393 # Converts a BED entry given as an arrayref
394 # to a Biopiece record which is returned as
397 # IMPORTANT! The BED_END and THICK_END positions
398 # will be the exact position in contrary to the
401 my ( $bed_entry, # BED entry as arrayref
406 my ( $cols, %bp_record );
408 $cols = scalar @{ $bed_entry };
410 if ( not defined $bed_entry->[ chrom ] and
411 not defined $bed_entry->[ chromStart ] and
412 not defined $bed_entry->[ chromEnd ] )
417 $bp_record{ "REC_TYPE" } = "BED";
418 $bp_record{ "BED_COLS" } = $cols;
419 $bp_record{ "CHR" } = $bed_entry->[ chrom ];
420 $bp_record{ "CHR_BEG" } = $bed_entry->[ chromStart ];
421 $bp_record{ "CHR_END" } = $bed_entry->[ chromEnd ] - 1;
422 $bp_record{ "BED_LEN" } = $bed_entry->[ chromEnd ] - $bed_entry->[ chromStart ];
424 return wantarray ? %bp_record : \%bp_record if $cols == 3;
426 $bp_record{ "Q_ID" } = $bed_entry->[ name ];
428 return wantarray ? %bp_record : \%bp_record if $cols == 4;
430 $bp_record{ "SCORE" } = $bed_entry->[ score ];
432 return wantarray ? %bp_record : \%bp_record if $cols == 5;
434 $bp_record{ "STRAND" } = $bed_entry->[ strand ];
436 return wantarray ? %bp_record : \%bp_record if $cols == 6;
438 $bp_record{ "THICK_BEG" } = $bed_entry->[ thickStart ];
439 $bp_record{ "THICK_END" } = $bed_entry->[ thickEnd ] - 1;
441 return wantarray ? %bp_record : \%bp_record if $cols == 8;
443 $bp_record{ "COLOR" } = $bed_entry->[ itemRgb ];
445 return wantarray ? %bp_record : \%bp_record if $cols == 9;
447 $bp_record{ "BLOCK_COUNT" } = $bed_entry->[ blockCount ];
448 $bp_record{ "BLOCK_LENS" } = $bed_entry->[ blockSizes ];
449 $bp_record{ "Q_BEGS" } = $bed_entry->[ blockStarts ];
451 return wantarray ? %bp_record : \%bp_record;
457 # Martin A. Hansen, November 2008.
459 # Converts a Biopieces record given as a hashref
460 # to a BED record which is returned as
461 # an arrayref. As much as possible of the Biopiece
462 # record is converted and undef is returned if
465 # IMPORTANT! The BED_END and THICK_END positions
466 # will be the inexact position used in the
469 my ( $bp_record, # hashref
470 $cols, # number of columns in BED entry - OPTIONAL (but faster)
475 my ( @bed_entry, @begs );
477 $cols ||= 12; # max number of columns possible
479 $bed_entry[ chrom ] = $bp_record->{ "CHR" } ||
480 $bp_record->{ "S_ID" } ||
483 if ( defined $bp_record->{ "CHR_BEG" } ) {
484 $bed_entry[ chromStart ] = $bp_record->{ "CHR_BEG" };
485 } elsif ( defined $bp_record->{ "S_BEG" } ) {
486 $bed_entry[ chromStart ] = $bp_record->{ "S_BEG" };
491 if ( defined $bp_record->{ "CHR_END" } ) {
492 $bed_entry[ chromEnd ] = $bp_record->{ "CHR_END" };
493 } elsif ( defined $bp_record->{ "S_END" }) {
494 $bed_entry[ chromEnd ] = $bp_record->{ "S_END" };
499 $bed_entry[ chromEnd ]++;
501 return wantarray ? @bed_entry : \@bed_entry if $cols == 3;
503 $bed_entry[ name ] = $bp_record->{ "Q_ID" } || return wantarray ? @bed_entry : \@bed_entry;
505 return wantarray ? @bed_entry : \@bed_entry if $cols == 4;
507 if ( exists $bp_record->{ "SCORE" } ) {
508 $bed_entry[ score ] = $bp_record->{ "SCORE" };
510 return wantarray ? @bed_entry : \@bed_entry;
513 return wantarray ? @bed_entry : \@bed_entry if $cols == 5;
515 if ( exists $bp_record->{ "STRAND" } ) {
516 $bed_entry[ strand ] = $bp_record->{ "STRAND" };
518 return wantarray ? @bed_entry : \@bed_entry;
521 return wantarray ? @bed_entry : \@bed_entry if $cols == 6;
523 if ( defined $bp_record->{ "THICK_BEG" } and
524 defined $bp_record->{ "THICK_END" } )
526 $bed_entry[ thickStart ] = $bp_record->{ "THICK_BEG" };
527 $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" } + 1;
529 elsif ( defined $bp_record->{ "BLOCK_COUNT" } )
531 $bed_entry[ thickStart ] = $bed_entry[ chromStart ];
532 $bed_entry[ thickEnd ] = $bed_entry[ chromEnd ];
535 return wantarray ? @bed_entry : \@bed_entry if $cols == 8;
537 if ( defined $bp_record->{ "COLOR" } )
539 $bed_entry[ itemRgb ] = $bp_record->{ "COLOR" };
541 elsif ( defined $bp_record->{ "BLOCK_COUNT" } )
543 $bed_entry[ itemRgb ] = 0;
546 return wantarray ? @bed_entry : \@bed_entry if $cols == 9;
548 if ( defined $bp_record->{ "BLOCK_COUNT" } and
549 defined $bp_record->{ "BLOCK_LENS" } and
550 defined $bp_record->{ "S_BEGS" } )
552 @begs = split ",", $bp_record->{ "S_BEGS" };
553 map { $_ -= $bed_entry[ chromStart ] } @begs;
555 $bed_entry[ blockCount ] = $bp_record->{ "BLOCK_COUNT" };
556 $bed_entry[ blockSizes ] = $bp_record->{ "BLOCK_LENS" };
557 $bed_entry[ blockStarts ] = join ",", @begs;
558 $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" } + 1;
560 elsif ( defined $bp_record->{ "BLOCK_COUNT" } and
561 defined $bp_record->{ "BLOCK_LENS" } and
562 defined $bp_record->{ "Q_BEGS" } )
564 $bed_entry[ blockCount ] = $bp_record->{ "BLOCK_COUNT" };
565 $bed_entry[ blockSizes ] = $bp_record->{ "BLOCK_LENS" };
566 $bed_entry[ blockStarts ] = $bp_record->{ "Q_BEGS" };
567 $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" } + 1;
570 return wantarray ? @bed_entry : \@bed_entry;
574 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TAG CONTIGS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
577 sub tag_contigs_assemble
579 # Martin A. Hansen, November 2008.
581 # Returns a path to a BED file with tab contigs.
583 my ( $bed_file, # path to BED file
586 $dir, # working directory
591 my ( $fh_in, $fh_out, $array, $new_file, $pos, $id, $beg, $end, $score, $entry );
593 $fh_in = Maasha::Filesys::file_read_open( $bed_file );
595 $array = tag_contigs_assemble_array( $fh_in );
599 $new_file = "$bed_file.tag_contigs";
601 $fh_out = Maasha::Filesys::file_write_open( $new_file );
606 while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg )
608 $entry->[ chrom ] = $chr;
609 $entry->[ chromStart ] = $beg;
610 $entry->[ chromEnd ] = $end;
611 $entry->[ name ] = sprintf( "TC%06d", $id );
612 $entry->[ score ] = $score;
613 $entry->[ strand ] = $strand;
615 bed_entry_put( $entry, $fh_out );
627 sub tag_contigs_assemble_array
629 # Martin A. Hansen, November 2008.
631 # Given a BED file with entries from only one
632 # chromosome assembles tag contigs from these
633 # ignoring strand information. Only tags with
634 # a score higher than the clone count over
635 # genomic loci (the score field) is included
636 # in the tag contigs.
642 # ======================== tag contig
645 my ( $fh, # file handle to BED file
648 # Returns an arrayref.
650 my ( $entry, $clones, $score, @array );
652 while ( $entry = bed_entry_get( $fh ) )
654 if ( $entry->[ name ] =~ /_(\d+)$/ )
658 if ( $entry->[ score ] )
660 $score = int( $clones / $entry->[ score ] );
662 map { $array[ $_ ] += $score } $entry->[ chromStart ] .. $entry->[ chromEnd ] - 1 if $score >= 1;
667 return wantarray ? @array : \@array;
673 # Martin A. Hansen, November 2008.
675 # Scans an array with tag contigs and locates
676 # the next contig from a given position. The
677 # score of the tag contig is determined as the
678 # maximum value of the tag contig. If a tag contig
679 # is found a triple is returned with beg, end and score
680 # otherwise an empty list is returned.
682 my ( $array, # array to scan
683 $beg, # position to start scanning from
686 # Returns an arrayref.
692 while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ }
696 while ( $array->[ $end ] )
698 $score = Maasha::Calc::max( $score, $array->[ $end ] );
704 return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ];
706 return wantarray ? () : [];
711 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<