1 package Maasha::UCSC::BED;
3 # Copyright (C) 2007-2008 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for interaction with Browser Extensible DATA (BED) entries and files.
27 # Read about the BED format here: http://genome.ucsc.edu/FAQ/FAQformat#format1
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
40 use vars qw( @ISA @EXPORT_OK );
44 @ISA = qw( Exporter );
50 chrom => 0, # BED field names
62 CHR => 0, # Biopieces field names
77 my $CHECK_ALL = 1; # Global flag indicating that BED input and output is checked thoroughly.
80 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
83 # Hash for converting BED keys to Biopieces keys.
87 chromStart => "CHR_BEG",
88 chromEnd => "CHR_END",
92 thickStart => "THICK_BEG",
93 thickEnd => "THICK_END",
95 blockCount => "BLOCK_COUNT",
96 blockSizes => "BLOCK_LENS",
97 blockStarts => "Q_BEGS",
101 # Hash for converting biopieces keys to BED keys.
103 my %BIOPIECES2BED = (
105 CHR_BEG => "chromStart",
106 CHR_END => "chromEnd",
110 THICK_BEG => "thickStart",
111 THICK_END => "thickEnd",
113 BLOCK_COUNT => "blockCount",
114 BLOCK_LENS => "blockSizes",
115 Q_BEGS => "blockStarts",
121 # Martin A. Hansen, September 2008.
123 # Reads a BED entry given a filehandle.
125 my ( $fh, # file handle
126 $cols, # columns to read - OPTIONAL (3,4,5,6,8,9 or 12)
131 my ( $line, @entry );
135 $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
137 return if not defined $line;
139 if ( not defined $cols ) {
140 $cols = 1 + $line =~ tr/\t//;
143 @entry = split "\t", $line, $cols + 1;
145 pop @entry if scalar @entry > $cols;
147 bed_entry_check( \@entry ) if $CHECK_ALL;
149 return wantarray ? @entry : \@entry;
155 # Martin A. Hansen, September 2008.
157 # Writes a BED entry array to file.
160 $fh, # file handle - OPTIONAL
161 $cols, # number of columns in BED file - OPTIONAL (3,4,5,6,8,9 or 12)
166 if ( $cols and $cols < scalar @{ $entry } ) {
167 @{ $entry } = @{ $entry }[ 0 .. $cols - 1 ];
170 bed_entry_check( $entry ) if $CHECK_ALL;
172 $fh = \*STDOUT if not defined $fh;
174 print $fh join( "\t", @{ $entry } ), "\n";
181 # Martin A. Hansen, November 2008.
183 # Checks a BED entry for integrity and
184 # raises an error if there is a problem.
186 my ( $bed, # array ref
191 my ( $cols, @block_sizes, @block_starts );
193 $cols = scalar @{ $bed };
196 Maasha::Common::error( qq(Bad BED entry - must contain at least 3 columns - not $cols) );
200 Maasha::Common::error( qq(Bad BED entry - must contains no more than 12 columns - not $cols) );
203 if ( $bed->[ chrom ] =~ /\s/ ) {
204 Maasha::Common::error( qq(Bad BED entry - no white space allowed in chrom field: "$bed->[ chrom ]") );
207 if ( $bed->[ chromStart ] =~ /\D/ ) {
208 Maasha::Common::error( qq(Bad BED entry - chromStart must be a whole number - not "$bed->[ chromStart ]") );
211 if ( $bed->[ chromEnd ] =~ /\D/ ) {
212 Maasha::Common::error( qq(Bad BED entry - chromEnd must be a whole number - not "$bed->[ chromEnd ]") );
215 if ( $bed->[ chromEnd ] < $bed->[ chromStart ] ) {
216 Maasha::Common::error( qq(Bad BED entry - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") );
219 return if @{ $bed } == 3;
221 if ( $bed->[ name ] =~ /\s/ ) {
222 Maasha::Common::error( qq(Bad BED entry - no white space allowed in name field: "$bed->[ name ]") );
225 return if @{ $bed } == 4;
227 if ( $bed->[ score ] =~ /\D/ ) {
228 Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->[ score ]") );
231 if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) {
232 Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->[ score ]") );
235 return if @{ $bed } == 5;
237 if ( $bed->[ strand ] ne '+' and $bed->[ strand ] ne '-' ) {
238 Maasha::Common::error( qq(Bad BED entry - strand must be + or - not "$bed->[ strand ]") );
241 return if @{ $bed } == 6;
243 if ( $bed->[ thickStart ] =~ /\D/ ) {
244 Maasha::Common::error( qq(Bad BED entry - thickStart must be a whole number - not "$bed->[ thickStart ]") );
247 if ( $bed->[ thickEnd ] =~ /\D/ ) {
248 Maasha::Common::error( qq(Bad BED entry - thickEnd must be a whole number - not "$bed->[ thickEnd ]") );
251 if ( $bed->[ thickEnd ] < $bed->[ thickStart ] ) {
252 Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") );
255 if ( $bed->[ thickStart ] < $bed->[ chromStart ] ) {
256 Maasha::Common::error( qq(Bad BED entry - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") );
259 if ( $bed->[ thickStart ] > $bed->[ chromEnd ] ) {
260 Maasha::Common::error( qq(Bad BED entry - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") );
263 if ( $bed->[ thickEnd ] < $bed->[ chromStart ] ) {
264 Maasha::Common::error( qq(Bad BED entry - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") );
267 if ( $bed->[ thickEnd ] > $bed->[ chromEnd ] ) {
268 Maasha::Common::error( qq(Bad BED entry - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") );
271 return if @{ $bed } == 8;
273 if ( $bed->[ itemRgb ] !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) {
274 Maasha::Common::error( qq(Bad BED entry - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") );
277 return if @{ $bed } == 9;
279 if ( $bed->[ blockCount ] =~ /\D/ ) {
280 Maasha::Common::error( qq(Bad BED entry - blockCount must be a whole number - not "$bed->[ blockCount ]") );
283 @block_sizes = split ",", $bed->[ blockSizes ];
285 if ( grep /\D/, @block_sizes ) {
286 Maasha::Common::error( qq(Bad BED entry - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") );
289 if ( $bed->[ blockCount ] != scalar @block_sizes ) {
290 Maasha::Common::error( qq(Bad BED entry - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") );
293 @block_starts = split ",", $bed->[ blockStarts ];
295 if ( grep /\D/, @block_starts ) {
296 Maasha::Common::error( qq(Bad BED entry - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") );
299 if ( $bed->[ blockCount ] != scalar @block_starts ) {
300 Maasha::Common::error( qq(Bad BED entry - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") );
303 if ( $bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ] ) {
304 Maasha::Common::error( qq(Bad BED entry - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) .
305 qq($bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ]) );
312 # Martin A. hansen, November 2008.
314 # Sorts a given BED file according to a given
316 # 1: chr AND chr_beg.
317 # 2: chr AND strand AND chr_beg.
319 # 4: strand AND chr_beg.
321 # Currently 'bed_sort' is used for sorting = a standalone c program
322 # that uses qsort (unstable sort).
324 my ( $file_in, # path to BED file.
325 $scheme, # sort scheme
326 $cols, # number of columns in BED file
333 $file_out = "$file_in.sort";
335 Maasha::Common::run( "bed_sort", "--sort $scheme --cols $cols $file_in > $file_out" );
337 if ( Maasha::Filesys::file_size( $file_in ) != Maasha::Filesys::file_size( $file_out ) ) {
338 Maasha::Common::error( qq(bed_sort of file "$file_in" failed) );
341 rename $file_out, $file_in;
345 sub bed_file_split_on_chr
347 # Martin A. Hansen, November 2008.
349 # Given a path to a BED file splits
350 # this file into one file per chromosome
351 # in a temporary directory. Returns a hash
352 # with chromosome name as key and the corresponding
355 my ( $file, # path to BED file
356 $dir, # working directory
357 $cols, # number of BED columns to read - OPTIONAL
362 my ( $fh_in, $fh_out, $entry, %fh_hash, %file_hash );
364 $fh_in = Maasha::Filesys::file_read_open( $file );
366 while ( $entry = bed_entry_get( $fh_in, $cols ) )
368 if ( not exists $file_hash{ $entry->[ chrom ] } )
370 $fh_hash{ $entry->[ chrom ] } = Maasha::Filesys::file_write_open( "$dir/$entry->[ chrom ]" );
371 $file_hash{ $entry->[ chrom ] } = "$dir/$entry->[ chrom ]";
374 $fh_out = $fh_hash{ $entry->[ chrom ] };
376 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $cols );
379 map { close $_ } keys %fh_hash;
381 return wantarray ? %file_hash : \%file_hash;
385 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BIOPIECES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
390 # Martin A. Hansen, November 2008.
392 # Converts a BED entry given as an arrayref
393 # to a Biopiece record which is returned as
396 # IMPORTANT! The BED_END and THICK_END positions
397 # will be the exact position in contrary to the
400 my ( $bed_entry, # BED entry as arrayref
405 my ( $cols, %bp_record );
407 $cols = scalar @{ $bed_entry };
409 if ( not defined $bed_entry->[ chrom ] and
410 not defined $bed_entry->[ chromStart ] and
411 not defined $bed_entry->[ chromEnd ] )
416 $bp_record{ "REC_TYPE" } = "BED";
417 $bp_record{ "BED_COLS" } = $cols;
418 $bp_record{ "CHR" } = $bed_entry->[ chrom ];
419 $bp_record{ "CHR_BEG" } = $bed_entry->[ chromStart ];
420 $bp_record{ "CHR_END" } = $bed_entry->[ chromEnd ] - 1;
421 $bp_record{ "BED_LEN" } = $bed_entry->[ chromEnd ] - $bed_entry->[ chromEnd ];
423 return wantarray ? %bp_record : \%bp_record if $cols == 3;
425 $bp_record{ "Q_ID" } = $bed_entry->[ name ];
427 return wantarray ? %bp_record : \%bp_record if $cols == 4;
429 $bp_record{ "SCORE" } = $bed_entry->[ score ];
431 return wantarray ? %bp_record : \%bp_record if $cols == 5;
433 $bp_record{ "STRAND" } = $bed_entry->[ strand ];
435 return wantarray ? %bp_record : \%bp_record if $cols == 6;
437 $bp_record{ "THICK_BEG" } = $bed_entry->[ thickStart ];
438 $bp_record{ "THICK_END" } = $bed_entry->[ thickEnd ] - 1;
440 return wantarray ? %bp_record : \%bp_record if $cols == 8;
442 $bp_record{ "COLOR" } = $bed_entry->[ itemRgb ];
444 return wantarray ? %bp_record : \%bp_record if $cols == 9;
446 $bp_record{ "BLOCK_COUNT" } = $bed_entry->[ blockCount ];
447 $bp_record{ "BLOCK_LENS" } = $bed_entry->[ blockSizes ];
448 $bp_record{ "Q_BEGS" } = $bed_entry->[ blockStarts ];
450 return wantarray ? %bp_record : \%bp_record;
456 # Martin A. Hansen, November 2008.
458 # Converts a Biopieces record given as a hashref
459 # to a BED record which is returned as
460 # an arrayref. As much as possible of the Biopiece
461 # record is converted and undef is returned if
464 # IMPORTANT! The BED_END and THICK_END positions
465 # will be the inexact position used in the
468 my ( $bp_record, # hashref
469 $cols, # number of columns in BED entry - OPTIONAL (but faster)
476 $cols ||= 12; # max number of columns possible
478 $bed_entry[ chrom ] = $bp_record->{ "CHR" } ||
479 $bp_record->{ "S_ID" } ||
482 $bed_entry[ chromStart ] = $bp_record->{ "CHR_BEG" } ||
483 $bp_record->{ "S_BEG" } ||
486 $bed_entry[ chromEnd ] = $bp_record->{ "CHR_END" } ||
487 $bp_record->{ "S_END" } ||
490 $bed_entry[ chromEnd ]++;
492 return wantarray ? @bed_entry : \@bed_entry if $cols == 3;
494 $bed_entry[ name ] = $bp_record->{ "Q_ID" } || return wantarray ? @bed_entry : \@bed_entry;
496 return wantarray ? @bed_entry : \@bed_entry if $cols == 4;
498 if ( exists $bp_record->{ "SCORE" } ) {
499 $bed_entry[ score ] = $bp_record->{ "SCORE" };
501 return wantarray ? @bed_entry : \@bed_entry;
504 return wantarray ? @bed_entry : \@bed_entry if $cols == 5;
506 if ( exists $bp_record->{ "STRAND" } ) {
507 $bed_entry[ strand ] = $bp_record->{ "STRAND" };
509 return wantarray ? @bed_entry : \@bed_entry;
512 return wantarray ? @bed_entry : \@bed_entry if $cols == 6;
514 if ( defined $bp_record->{ "THICK_BEG" } and
515 defined $bp_record->{ "THICK_END" } )
517 $bed_entry[ thickStart ] = $bp_record->{ "THICK_BEG" };
518 $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" };
521 return wantarray ? @bed_entry : \@bed_entry if $cols == 8;
523 if ( exists $bp_record->{ "COLOR" } ) {
524 $bed_entry[ itemRgb ] = $bp_record->{ "COLOR" };
526 return wantarray ? @bed_entry : \@bed_entry;
529 return wantarray ? @bed_entry : \@bed_entry if $cols == 9;
531 if ( defined $bp_record->{ "BLOCK_COUNT" } and
532 defined $bp_record->{ "BLOCK_LENS" } and
533 defined $bp_record->{ "Q_BEGS" } )
535 $bed_entry[ blockCount ] = $bp_record->{ "BLOCK_COUNT" };
536 $bed_entry[ blockSizes ] = $bp_record->{ "BLOCK_LENS" };
537 $bed_entry[ blockStarts ] = $bp_record->{ "Q_BEGS" };
538 $bed_entry[ thickEnd ]++;
541 return wantarray ? @bed_entry : \@bed_entry;
545 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TAG CONTIGS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
548 sub tag_contigs_assemble
550 # Martin A. Hansen, November 2008.
552 # Returns a path to a BED file with tab contigs.
554 my ( $bed_file, # path to BED file
557 $dir, # working directory
562 my ( $fh_in, $fh_out, $array, $new_file, $pos, $id, $beg, $end, $score, $entry );
564 $fh_in = Maasha::Filesys::file_read_open( $bed_file );
566 $array = tag_contigs_assemble_array( $fh_in );
570 $new_file = "$bed_file.tag_contigs";
572 $fh_out = Maasha::Filesys::file_write_open( $new_file );
577 while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg )
579 $entry->[ chrom ] = $chr;
580 $entry->[ chromStart ] = $beg;
581 $entry->[ chromEnd ] = $end;
582 $entry->[ name ] = sprintf( "TC%06d", $id );
583 $entry->[ score ] = $score;
584 $entry->[ strand ] = $strand;
586 bed_entry_put( $entry, $fh_out );
598 sub tag_contigs_assemble_array
600 # Martin A. Hansen, November 2008.
602 # Given a BED file with entries from only one
603 # chromosome assembles tag contigs from these
604 # ignoring strand information. Only tags with
605 # a score higher than the clone count over
606 # genomic loci (the score field) is included
607 # in the tag contigs.
613 # ======================== tag contig
616 my ( $fh, # file handle to BED file
619 # Returns an arrayref.
621 my ( $entry, $clones, $score, @array );
623 while ( $entry = bed_entry_get( $fh ) )
625 if ( $entry->[ name ] =~ /_(\d+)$/ )
629 if ( $entry->[ score ] )
631 $score = int( $clones / $entry->[ score ] );
633 map { $array[ $_ ] += $score } $entry->[ chromStart ] .. $entry->[ chromEnd ] - 1 if $score >= 1;
638 return wantarray ? @array : \@array;
644 # Martin A. Hansen, November 2008.
646 # Scans an array with tag contigs and locates
647 # the next contig from a given position. The
648 # score of the tag contig is determined as the
649 # maximum value of the tag contig. If a tag contig
650 # is found a triple is returned with beg, end and score
651 # otherwise an empty list is returned.
653 my ( $array, # array to scan
654 $beg, # position to start scanning from
657 # Returns an arrayref.
663 while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ }
667 while ( $array->[ $end ] )
669 $score = Maasha::Calc::max( $score, $array->[ $end ] );
675 return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ];
677 return wantarray ? () : [];
682 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<