1 package Maasha::UCSC::BED;
3 # Copyright (C) 2007-2008 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for interaction with Browser Extensible DATA (BED) entries and files.
27 # Read about the BED format here: http://genome.ucsc.edu/FAQ/FAQformat#format1
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
40 use vars qw( @ISA @EXPORT_OK );
44 @ISA = qw( Exporter );
50 chrom => 0, # BED field names
62 CHR => 0, # Biopieces field names
77 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
80 # Hash for converting BED keys to Biopieces keys.
84 chromStart => "CHR_BEG",
85 chromEnd => "CHR_END",
89 thickStart => "THICK_BEG",
90 thickEnd => "THICK_END",
92 blockCount => "BLOCK_COUNT",
93 blockSizes => "BLOCK_LENS",
94 blockStarts => "Q_BEGS",
98 # Hash for converting biopieces keys to BED keys.
100 my %BIOPIECES2BED = (
102 CHR_BEG => "chromStart",
103 CHR_END => "chromEnd",
107 THICK_BEG => "thickStart",
108 THICK_END => "thickEnd",
110 BLOCK_COUNT => "blockCount",
111 BLOCK_LENS => "blockSizes",
112 Q_BEGS => "blockStarts",
118 # Martin A. Hansen, September 2008.
120 # Reads a BED entry given a filehandle.
122 my ( $fh, # file handle
123 $cols, # columns to read - OPTIONAL (3,4,5,6,8,9 or 12)
124 $check, # check integrity of BED values - OPTIONAL
129 my ( $line, @entry );
133 $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
135 return if not defined $line;
137 if ( not defined $cols ) {
138 $cols = 1 + $line =~ tr/\t//;
141 @entry = split "\t", $line, $cols + 1;
143 pop @entry if scalar @entry > $cols;
145 bed_entry_check( \@entry ) if $check;
147 return wantarray ? @entry : \@entry;
153 # Martin A. Hansen, September 2008.
155 # Writes a BED entry array to file.
158 $fh, # file handle - OPTIONAL
159 $cols, # number of columns in BED file - OPTIONAL (3,4,5,6,8,9 or 12)
160 $check, # check integrity of BED values - OPTIONAL
165 if ( $cols and $cols < scalar @{ $entry } ) {
166 @{ $entry } = @{ $entry }[ 0 .. $cols - 1 ];
169 bed_entry_check( $entry ) if $check;
171 $fh = \*STDOUT if not defined $fh;
173 print $fh join( "\t", @{ $entry } ), "\n";
180 # Martin A. Hansen, November 2008.
182 # Checks a BED entry for integrity and
183 # raises an error if there is a problem.
185 my ( $bed, # array ref
190 my ( $cols, @block_sizes, @block_starts );
192 $cols = scalar @{ $bed };
195 Maasha::Common::error( qq(Bad BED entry - must contain at least 3 columns - not $cols) );
199 Maasha::Common::error( qq(Bad BED entry - must contains no more than 12 columns - not $cols) );
202 if ( $bed->[ chrom ] =~ /\s/ ) {
203 Maasha::Common::error( qq(Bad BED entry - no white space allowed in chrom field: "$bed->[ chrom ]") );
206 if ( $bed->[ chromStart ] =~ /\D/ ) {
207 Maasha::Common::error( qq(Bad BED entry - chromStart must be a whole number - not "$bed->[ chromStart ]") );
210 if ( $bed->[ chromEnd ] =~ /\D/ ) {
211 Maasha::Common::error( qq(Bad BED entry - chromEnd must be a whole number - not "$bed->[ chromEnd ]") );
214 if ( $bed->[ chromEnd ] < $bed->[ chromStart ] ) {
215 Maasha::Common::error( qq(Bad BED entry - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") );
218 return if $cols == 3;
220 if ( $bed->[ name ] =~ /\s/ ) {
221 Maasha::Common::error( qq(Bad BED entry - no white space allowed in name field: "$bed->[ name ]") );
224 return if $cols == 4;
226 if ( $bed->[ score ] =~ /\D/ ) {
227 Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->[ score ]") );
230 # if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) { # disabled - too restrictive !
231 if ( $bed->[ score ] < 0 ) {
232 Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->[ score ]") );
235 return if $cols == 5;
237 if ( $bed->[ strand ] ne '+' and $bed->[ strand ] ne '-' ) {
238 Maasha::Common::error( qq(Bad BED entry - strand must be + or - not "$bed->[ strand ]") );
241 return if $cols == 6;
243 if ( $bed->[ thickStart ] =~ /\D/ ) {
244 Maasha::Common::error( qq(Bad BED entry - thickStart must be a whole number - not "$bed->[ thickStart ]") );
247 if ( $bed->[ thickEnd ] =~ /\D/ ) {
248 Maasha::Common::error( qq(Bad BED entry - thickEnd must be a whole number - not "$bed->[ thickEnd ]") );
251 if ( $bed->[ thickEnd ] < $bed->[ thickStart ] ) {
252 Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") );
255 if ( $bed->[ thickStart ] < $bed->[ chromStart ] ) {
256 Maasha::Common::error( qq(Bad BED entry - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") );
259 if ( $bed->[ thickStart ] > $bed->[ chromEnd ] ) {
260 Maasha::Common::error( qq(Bad BED entry - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") );
263 if ( $bed->[ thickEnd ] < $bed->[ chromStart ] ) {
264 Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") );
267 if ( $bed->[ thickEnd ] > $bed->[ chromEnd ] ) {
268 Maasha::Common::error( qq(Bad BED entry - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") );
271 return if $cols == 8;
273 if ( $bed->[ itemRgb ] !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) {
274 Maasha::Common::error( qq(Bad BED entry - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") );
277 return if $cols == 9;
279 if ( $bed->[ blockCount ] =~ /\D/ ) {
280 Maasha::Common::error( qq(Bad BED entry - blockCount must be a whole number - not "$bed->[ blockCount ]") );
283 @block_sizes = split ",", $bed->[ blockSizes ];
285 if ( grep /\D/, @block_sizes ) {
286 Maasha::Common::error( qq(Bad BED entry - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") );
289 if ( $bed->[ blockCount ] != scalar @block_sizes ) {
290 Maasha::Common::error( qq(Bad BED entry - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") );
293 @block_starts = split ",", $bed->[ blockStarts ];
295 if ( grep /\D/, @block_starts ) {
296 Maasha::Common::error( qq(Bad BED entry - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") );
299 if ( $bed->[ blockCount ] != scalar @block_starts ) {
300 Maasha::Common::error( qq(Bad BED entry - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") );
303 if ( $bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ] ) {
304 Maasha::Common::error( qq(Bad BED entry - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) .
305 qq($bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ]) );
312 # Martin A. hansen, November 2008.
314 # Sorts a given BED file according to a given
316 # 1: chr AND chr_beg.
317 # 2: chr AND strand AND chr_beg.
319 # 4: strand AND chr_beg.
321 # Currently 'bed_sort' is used for sorting = a standalone c program
322 # that uses qsort (unstable sort).
324 my ( $file_in, # path to BED file.
325 $scheme, # sort scheme
326 $cols, # number of columns in BED file
333 $file_out = "$file_in.sort";
335 Maasha::Common::run( "bed_sort", "--sort $scheme --cols $cols $file_in > $file_out" );
337 if ( Maasha::Filesys::file_size( $file_in ) != Maasha::Filesys::file_size( $file_out ) ) {
338 Maasha::Common::error( qq(bed_sort of file "$file_in" failed) );
341 rename $file_out, $file_in;
345 sub bed_file_split_on_chr
347 # Martin A. Hansen, November 2008.
349 # Given a path to a BED file splits
350 # this file into one file per chromosome
351 # in a temporary directory. Returns a hash
352 # with chromosome name as key and the corresponding
355 my ( $file, # path to BED file
356 $dir, # working directory
357 $cols, # number of BED columns to read - OPTIONAL
362 my ( $fh_in, $fh_out, $entry, %fh_hash, %file_hash );
364 $fh_in = Maasha::Filesys::file_read_open( $file );
366 while ( $entry = bed_entry_get( $fh_in, $cols ) )
368 if ( not exists $file_hash{ $entry->[ chrom ] } )
370 $fh_hash{ $entry->[ chrom ] } = Maasha::Filesys::file_write_open( "$dir/$entry->[ chrom ]" );
371 $file_hash{ $entry->[ chrom ] } = "$dir/$entry->[ chrom ]";
374 $fh_out = $fh_hash{ $entry->[ chrom ] };
376 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $cols );
379 map { close $_ } keys %fh_hash;
381 return wantarray ? %file_hash : \%file_hash;
385 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BIOPIECES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
390 # Martin A. Hansen, November 2008.
392 # Converts a BED entry given as an arrayref
393 # to a Biopiece record which is returned as
396 # IMPORTANT! The BED_END and THICK_END positions
397 # will be the exact position in contrary to the
400 my ( $bed_entry, # BED entry as arrayref
405 my ( $cols, %bp_record );
407 $cols = scalar @{ $bed_entry };
409 if ( not defined $bed_entry->[ chrom ] and
410 not defined $bed_entry->[ chromStart ] and
411 not defined $bed_entry->[ chromEnd ] )
416 $bp_record{ "REC_TYPE" } = "BED";
417 $bp_record{ "BED_COLS" } = $cols;
418 $bp_record{ "CHR" } = $bed_entry->[ chrom ];
419 $bp_record{ "CHR_BEG" } = $bed_entry->[ chromStart ];
420 $bp_record{ "CHR_END" } = $bed_entry->[ chromEnd ] - 1;
421 $bp_record{ "BED_LEN" } = $bed_entry->[ chromEnd ] - $bed_entry->[ chromStart ];
423 return wantarray ? %bp_record : \%bp_record if $cols == 3;
425 $bp_record{ "Q_ID" } = $bed_entry->[ name ];
427 return wantarray ? %bp_record : \%bp_record if $cols == 4;
429 $bp_record{ "SCORE" } = $bed_entry->[ score ];
431 return wantarray ? %bp_record : \%bp_record if $cols == 5;
433 $bp_record{ "STRAND" } = $bed_entry->[ strand ];
435 return wantarray ? %bp_record : \%bp_record if $cols == 6;
437 $bp_record{ "THICK_BEG" } = $bed_entry->[ thickStart ];
438 $bp_record{ "THICK_END" } = $bed_entry->[ thickEnd ] - 1;
440 return wantarray ? %bp_record : \%bp_record if $cols == 8;
442 $bp_record{ "COLOR" } = $bed_entry->[ itemRgb ];
444 return wantarray ? %bp_record : \%bp_record if $cols == 9;
446 $bp_record{ "BLOCK_COUNT" } = $bed_entry->[ blockCount ];
447 $bp_record{ "BLOCK_LENS" } = $bed_entry->[ blockSizes ];
448 $bp_record{ "Q_BEGS" } = $bed_entry->[ blockStarts ];
450 return wantarray ? %bp_record : \%bp_record;
456 # Martin A. Hansen, November 2008.
458 # Converts a Biopieces record given as a hashref
459 # to a BED record which is returned as
460 # an arrayref. As much as possible of the Biopiece
461 # record is converted and undef is returned if
464 # IMPORTANT! The BED_END and THICK_END positions
465 # will be the inexact position used in the
468 my ( $bp_record, # hashref
469 $cols, # number of columns in BED entry - OPTIONAL (but faster)
476 $cols ||= 12; # max number of columns possible
478 $bed_entry[ chrom ] = $bp_record->{ "CHR" } ||
479 $bp_record->{ "S_ID" } ||
482 if ( defined $bp_record->{ "CHR_BEG" } ) {
483 $bed_entry[ chromStart ] = $bp_record->{ "CHR_BEG" };
484 } elsif ( defined $bp_record->{ "S_BEG" } ) {
485 $bed_entry[ chromStart ] = $bp_record->{ "S_BEG" };
490 if ( defined $bp_record->{ "CHR_END" } ) {
491 $bed_entry[ chromEnd ] = $bp_record->{ "CHR_END" };
492 } elsif ( defined $bp_record->{ "S_END" }) {
493 $bed_entry[ chromEnd ] = $bp_record->{ "S_END" };
498 $bed_entry[ chromEnd ]++;
500 return wantarray ? @bed_entry : \@bed_entry if $cols == 3;
502 $bed_entry[ name ] = $bp_record->{ "Q_ID" } || return wantarray ? @bed_entry : \@bed_entry;
504 return wantarray ? @bed_entry : \@bed_entry if $cols == 4;
506 if ( exists $bp_record->{ "SCORE" } ) {
507 $bed_entry[ score ] = $bp_record->{ "SCORE" };
509 return wantarray ? @bed_entry : \@bed_entry;
512 return wantarray ? @bed_entry : \@bed_entry if $cols == 5;
514 if ( exists $bp_record->{ "STRAND" } ) {
515 $bed_entry[ strand ] = $bp_record->{ "STRAND" };
517 return wantarray ? @bed_entry : \@bed_entry;
520 return wantarray ? @bed_entry : \@bed_entry if $cols == 6;
522 if ( defined $bp_record->{ "THICK_BEG" } and
523 defined $bp_record->{ "THICK_END" } )
525 $bed_entry[ thickStart ] = $bp_record->{ "THICK_BEG" };
526 $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" };
529 return wantarray ? @bed_entry : \@bed_entry if $cols == 8;
531 if ( exists $bp_record->{ "COLOR" } ) {
532 $bed_entry[ itemRgb ] = $bp_record->{ "COLOR" };
534 return wantarray ? @bed_entry : \@bed_entry;
537 return wantarray ? @bed_entry : \@bed_entry if $cols == 9;
539 if ( defined $bp_record->{ "BLOCK_COUNT" } and
540 defined $bp_record->{ "BLOCK_LENS" } and
541 defined $bp_record->{ "Q_BEGS" } )
543 $bed_entry[ blockCount ] = $bp_record->{ "BLOCK_COUNT" };
544 $bed_entry[ blockSizes ] = $bp_record->{ "BLOCK_LENS" };
545 $bed_entry[ blockStarts ] = $bp_record->{ "Q_BEGS" };
546 $bed_entry[ thickEnd ]++;
549 return wantarray ? @bed_entry : \@bed_entry;
553 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TAG CONTIGS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
556 sub tag_contigs_assemble
558 # Martin A. Hansen, November 2008.
560 # Returns a path to a BED file with tab contigs.
562 my ( $bed_file, # path to BED file
565 $dir, # working directory
570 my ( $fh_in, $fh_out, $array, $new_file, $pos, $id, $beg, $end, $score, $entry );
572 $fh_in = Maasha::Filesys::file_read_open( $bed_file );
574 $array = tag_contigs_assemble_array( $fh_in );
578 $new_file = "$bed_file.tag_contigs";
580 $fh_out = Maasha::Filesys::file_write_open( $new_file );
585 while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg )
587 $entry->[ chrom ] = $chr;
588 $entry->[ chromStart ] = $beg;
589 $entry->[ chromEnd ] = $end;
590 $entry->[ name ] = sprintf( "TC%06d", $id );
591 $entry->[ score ] = $score;
592 $entry->[ strand ] = $strand;
594 bed_entry_put( $entry, $fh_out );
606 sub tag_contigs_assemble_array
608 # Martin A. Hansen, November 2008.
610 # Given a BED file with entries from only one
611 # chromosome assembles tag contigs from these
612 # ignoring strand information. Only tags with
613 # a score higher than the clone count over
614 # genomic loci (the score field) is included
615 # in the tag contigs.
621 # ======================== tag contig
624 my ( $fh, # file handle to BED file
627 # Returns an arrayref.
629 my ( $entry, $clones, $score, @array );
631 while ( $entry = bed_entry_get( $fh ) )
633 if ( $entry->[ name ] =~ /_(\d+)$/ )
637 if ( $entry->[ score ] )
639 $score = int( $clones / $entry->[ score ] );
641 map { $array[ $_ ] += $score } $entry->[ chromStart ] .. $entry->[ chromEnd ] - 1 if $score >= 1;
646 return wantarray ? @array : \@array;
652 # Martin A. Hansen, November 2008.
654 # Scans an array with tag contigs and locates
655 # the next contig from a given position. The
656 # score of the tag contig is determined as the
657 # maximum value of the tag contig. If a tag contig
658 # is found a triple is returned with beg, end and score
659 # otherwise an empty list is returned.
661 my ( $array, # array to scan
662 $beg, # position to start scanning from
665 # Returns an arrayref.
671 while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ }
675 while ( $array->[ $end ] )
677 $score = Maasha::Calc::max( $score, $array->[ $end ] );
683 return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ];
685 return wantarray ? () : [];
690 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<