1 package Maasha::UCSC::BED;
3 # Copyright (C) 2007-2008 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for interaction with Browser Extensible DATA (BED) entries and files.
27 # Read about the BED format here: http://genome.ucsc.edu/FAQ/FAQformat#format1
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
41 use vars qw( @ISA @EXPORT_OK );
45 @ISA = qw( Exporter );
51 chrom => 0, # BED field names
63 CHR => 0, # Biopieces field names
78 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BED format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
81 # Hash for converting BED keys to Biopieces keys.
85 chromStart => "CHR_BEG",
86 chromEnd => "CHR_END",
90 thickStart => "THICK_BEG",
91 thickEnd => "THICK_END",
93 blockCount => "BLOCK_COUNT",
94 blockSizes => "BLOCK_LENS",
95 blockStarts => "Q_BEGS",
99 # Hash for converting biopieces keys to BED keys.
101 my %BIOPIECES2BED = (
103 CHR_BEG => "chromStart",
104 CHR_END => "chromEnd",
108 THICK_BEG => "thickStart",
109 THICK_END => "thickEnd",
111 BLOCK_COUNT => "blockCount",
112 BLOCK_LENS => "blockSizes",
113 Q_BEGS => "blockStarts",
119 # Martin A. Hansen, September 2008.
121 # Reads a BED entry given a filehandle.
123 my ( $fh, # file handle
124 $cols, # columns to read - OPTIONAL (3,4,5,6,8,9 or 12)
125 $check, # check integrity of BED values - OPTIONAL
130 my ( $line, @entry );
134 return if not defined $line;
136 $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
138 if ( not defined $cols ) {
139 $cols = 1 + $line =~ tr/\t//;
142 @entry = split "\t", $line, $cols + 1;
144 pop @entry if scalar @entry > $cols;
146 bed_entry_check( \@entry ) if $check;
148 return wantarray ? @entry : \@entry;
154 # Martin A. Hansen, September 2008.
156 # Writes a BED entry array to file.
159 $fh, # file handle - OPTIONAL
160 $cols, # number of columns in BED file - OPTIONAL (3,4,5,6,8,9 or 12)
161 $check, # check integrity of BED values - OPTIONAL
166 if ( $cols and $cols < scalar @{ $entry } ) {
167 @{ $entry } = @{ $entry }[ 0 .. $cols - 1 ];
170 bed_entry_check( $entry ) if $check;
172 $fh = \*STDOUT if not defined $fh;
174 print $fh join( "\t", @{ $entry } ), "\n";
181 # Martin A. Hansen, November 2008.
183 # Checks a BED entry for integrity and
184 # raises an error if there is a problem.
186 my ( $bed, # array ref
191 my ( $entry, $cols, @block_sizes, @block_starts );
193 $entry = join "\t", @{ $bed };
194 $cols = scalar @{ $bed };
197 Maasha::Common::error( qq(Bad BED entry "$entry" - must contain at least 3 columns - not $cols) );
201 Maasha::Common::error( qq(Bad BED entry "$entry" - must contains no more than 12 columns - not $cols) );
204 if ( $bed->[ chrom ] =~ /\s/ ) {
205 Maasha::Common::error( qq(Bad BED entry "$entry" - no white space allowed in chrom field: "$bed->[ chrom ]") );
208 if ( $bed->[ chromStart ] =~ /\D/ ) {
209 Maasha::Common::error( qq(Bad BED entry "$entry" - chromStart must be a whole number - not "$bed->[ chromStart ]") );
212 if ( $bed->[ chromEnd ] =~ /\D/ ) {
213 Maasha::Common::error( qq(Bad BED entry "$entry" - chromEnd must be a whole number - not "$bed->[ chromEnd ]") );
216 if ( $bed->[ chromEnd ] < $bed->[ chromStart ] ) {
217 Maasha::Common::error( qq(Bad BED entry "$entry" - chromEnd must be greater than chromStart - not "$bed->[ chromStart ] > $bed->[ chromEnd ]") );
220 return if $cols == 3;
222 if ( $bed->[ name ] =~ /\s/ ) {
223 Maasha::Common::error( qq(Bad BED entry "$entry" - no white space allowed in name field: "$bed->[ name ]") );
226 return if $cols == 4;
228 if ( $bed->[ score ] =~ /\D/ ) {
229 Maasha::Common::error( qq(Bad BED entry "$entry" - score must be a whole number - not "$bed->[ score ]") );
232 # if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) { # disabled - too restrictive !
233 if ( $bed->[ score ] < 0 ) {
234 Maasha::Common::error( qq(Bad BED entry "$entry" - score must be between 0 and 1000 - not "$bed->[ score ]") );
237 return if $cols == 5;
239 if ( $bed->[ strand ] ne '+' and $bed->[ strand ] ne '-' ) {
240 Maasha::Common::error( qq(Bad BED entry "$entry" - strand must be + or - not "$bed->[ strand ]") );
243 return if $cols == 6;
245 if ( $bed->[ thickStart ] =~ /\D/ ) {
246 Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be a whole number - not "$bed->[ thickStart ]") );
249 if ( $bed->[ thickEnd ] =~ /\D/ ) {
250 Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be a whole number - not "$bed->[ thickEnd ]") );
253 if ( $bed->[ thickEnd ] < $bed->[ thickStart ] ) {
254 Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be greater than thickStart - not "$bed->[ thickStart ] > $bed->[ thickEnd ]") );
257 if ( $bed->[ thickStart ] < $bed->[ chromStart ] ) {
258 Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be greater than chromStart - not "$bed->[ thickStart ] < $bed->[ chromStart ]") );
261 if ( $bed->[ thickStart ] > $bed->[ chromEnd ] ) {
262 Maasha::Common::error( qq(Bad BED entry "$entry" - thickStart must be less than chromEnd - not "$bed->[ thickStart ] > $bed->[ chromEnd ]") );
265 if ( $bed->[ thickEnd ] < $bed->[ chromStart ] ) {
266 Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be greater than chromStart - not "$bed->[ thickEnd ] < $bed->[ chromStart ]") );
269 if ( $bed->[ thickEnd ] > $bed->[ chromEnd ] ) {
270 Maasha::Common::error( qq(Bad BED entry "$entry" - thickEnd must be less than chromEnd - not "$bed->[ thickEnd ] > $bed->[ chromEnd ]") );
273 return if $cols == 8;
275 if ( $bed->[ itemRgb ] !~ /^(0|\d{1,3},\d{1,3},\d{1,3},?)$/ ) {
276 Maasha::Common::error( qq(Bad BED entry "$entry" - itemRgb must be 0 or in the form of 255,0,0 - not "$bed->[ itemRgb ]") );
279 return if $cols == 9;
281 if ( $bed->[ blockCount ] =~ /\D/ ) {
282 Maasha::Common::error( qq(Bad BED entry "$entry" - blockCount must be a whole number - not "$bed->[ blockCount ]") );
285 @block_sizes = split ",", $bed->[ blockSizes ];
287 if ( grep /\D/, @block_sizes ) {
288 Maasha::Common::error( qq(Bad BED entry "$entry" - blockSizes must be whole numbers - not "$bed->[ blockSizes ]") );
291 if ( $bed->[ blockCount ] != scalar @block_sizes ) {
292 Maasha::Common::error( qq(Bad BED entry "$entry" - blockSizes "$bed->[ blockSizes ]" must match blockCount "$bed->[ blockCount ]") );
295 @block_starts = split ",", $bed->[ blockStarts ];
297 if ( grep /\D/, @block_starts ) {
298 Maasha::Common::error( qq(Bad BED entry "$entry" - blockStarts must be whole numbers - not "$bed->[ blockStarts ]") );
301 if ( $bed->[ blockCount ] != scalar @block_starts ) {
302 Maasha::Common::error( qq(Bad BED entry "$entry" - blockStarts "$bed->[ blockStarts ]" must match blockCount "$bed->[ blockCount ]") );
305 if ( $bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ] ) {
306 Maasha::Common::error( qq(Bad BED entry "$entry" - chromStart + blockStarts[last] + blockSizes[last] must equal chromEnd: ) .
307 qq($bed->[ chromStart ] + $block_starts[ -1 ] + $block_sizes[ -1 ] != $bed->[ chromEnd ]) );
314 # Martin A. hansen, November 2008.
316 # Sorts a given BED file according to a given
318 # 1: chr AND chr_beg.
319 # 2: chr AND strand AND chr_beg.
321 # 4: strand AND chr_beg.
323 # Currently 'bed_sort' is used for sorting = a standalone c program
324 # that uses qsort (unstable sort).
326 my ( $file_in, # path to BED file.
327 $scheme, # sort scheme
328 $cols, # number of columns in BED file
335 $file_out = "$file_in.sort";
337 Maasha::Common::run( "bed_sort", "--sort $scheme --cols $cols $file_in > $file_out" );
339 if ( Maasha::Filesys::file_size( $file_in ) != Maasha::Filesys::file_size( $file_out ) ) {
340 Maasha::Common::error( qq(bed_sort of file "$file_in" failed) );
343 rename $file_out, $file_in;
347 sub bed_file_split_on_chr
349 # Martin A. Hansen, November 2008.
351 # Given a path to a BED file splits
352 # this file into one file per chromosome
353 # in a temporary directory. Returns a hash
354 # with chromosome name as key and the corresponding
357 my ( $file, # path to BED file
358 $dir, # working directory
359 $cols, # number of BED columns to read - OPTIONAL
364 my ( $fh_in, $fh_out, $entry, %fh_hash, %file_hash );
366 $fh_in = Maasha::Filesys::file_read_open( $file );
368 while ( $entry = bed_entry_get( $fh_in, $cols ) )
370 if ( not exists $file_hash{ $entry->[ chrom ] } )
372 $fh_hash{ $entry->[ chrom ] } = Maasha::Filesys::file_write_open( "$dir/$entry->[ chrom ]" );
373 $file_hash{ $entry->[ chrom ] } = "$dir/$entry->[ chrom ]";
376 $fh_out = $fh_hash{ $entry->[ chrom ] };
378 Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $cols );
381 map { close $_ } keys %fh_hash;
383 return wantarray ? %file_hash : \%file_hash;
387 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BIOPIECES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
392 # Martin A. Hansen, November 2008.
394 # Converts a BED entry given as an arrayref
395 # to a Biopiece record which is returned as
398 # IMPORTANT! The BED_END and THICK_END positions
399 # will be the exact position in contrary to the
402 my ( $bed_entry, # BED entry as arrayref
407 my ( $cols, %bp_record );
409 $cols = scalar @{ $bed_entry };
411 if ( not defined $bed_entry->[ chrom ] and
412 not defined $bed_entry->[ chromStart ] and
413 not defined $bed_entry->[ chromEnd ] )
418 $bp_record{ "REC_TYPE" } = "BED";
419 $bp_record{ "BED_COLS" } = $cols;
420 $bp_record{ "CHR" } = $bed_entry->[ chrom ];
421 $bp_record{ "CHR_BEG" } = $bed_entry->[ chromStart ];
422 $bp_record{ "CHR_END" } = $bed_entry->[ chromEnd ] - 1;
423 $bp_record{ "BED_LEN" } = $bed_entry->[ chromEnd ] - $bed_entry->[ chromStart ];
425 return wantarray ? %bp_record : \%bp_record if $cols == 3;
427 $bp_record{ "Q_ID" } = $bed_entry->[ name ];
429 return wantarray ? %bp_record : \%bp_record if $cols == 4;
431 $bp_record{ "SCORE" } = $bed_entry->[ score ];
433 return wantarray ? %bp_record : \%bp_record if $cols == 5;
435 $bp_record{ "STRAND" } = $bed_entry->[ strand ];
437 return wantarray ? %bp_record : \%bp_record if $cols == 6;
439 $bp_record{ "THICK_BEG" } = $bed_entry->[ thickStart ];
440 $bp_record{ "THICK_END" } = $bed_entry->[ thickEnd ] - 1;
442 return wantarray ? %bp_record : \%bp_record if $cols == 8;
444 $bp_record{ "COLOR" } = $bed_entry->[ itemRgb ];
446 return wantarray ? %bp_record : \%bp_record if $cols == 9;
448 $bp_record{ "BLOCK_COUNT" } = $bed_entry->[ blockCount ];
449 $bp_record{ "BLOCK_LENS" } = $bed_entry->[ blockSizes ];
450 $bp_record{ "Q_BEGS" } = $bed_entry->[ blockStarts ];
452 return wantarray ? %bp_record : \%bp_record;
458 # Martin A. Hansen, November 2008.
460 # Converts a Biopieces record given as a hashref
461 # to a BED record which is returned as
462 # an arrayref. As much as possible of the Biopiece
463 # record is converted and undef is returned if
466 # IMPORTANT! The chromEnd and thickEnd positions
467 # will be the inexact position used in the
468 # UCSC scheme (+1 to all ends).
470 my ( $bp_record, # hashref
471 $cols, # number of columns in BED entry - OPTIONAL (but faster)
476 my ( @bed_entry, @begs );
478 $cols ||= 12; # max number of columns possible
480 $bed_entry[ chrom ] = $bp_record->{ "CHR" } ||
481 $bp_record->{ "S_ID" } ||
484 if ( defined $bp_record->{ "CHR_BEG" } ) {
485 $bed_entry[ chromStart ] = $bp_record->{ "CHR_BEG" };
486 } elsif ( defined $bp_record->{ "S_BEG" } ) {
487 $bed_entry[ chromStart ] = $bp_record->{ "S_BEG" };
492 if ( defined $bp_record->{ "CHR_END" } ) {
493 $bed_entry[ chromEnd ] = $bp_record->{ "CHR_END" } + 1;
494 } elsif ( defined $bp_record->{ "S_END" }) {
495 $bed_entry[ chromEnd ] = $bp_record->{ "S_END" } + 1;
500 return wantarray ? @bed_entry : \@bed_entry if $cols == 3;
502 $bed_entry[ name ] = $bp_record->{ "Q_ID" } || return wantarray ? @bed_entry : \@bed_entry;
504 return wantarray ? @bed_entry : \@bed_entry if $cols == 4;
506 if ( exists $bp_record->{ "SCORE" } ) {
507 $bed_entry[ score ] = $bp_record->{ "SCORE" };
509 return wantarray ? @bed_entry : \@bed_entry;
512 return wantarray ? @bed_entry : \@bed_entry if $cols == 5;
514 if ( exists $bp_record->{ "STRAND" } ) {
515 $bed_entry[ strand ] = $bp_record->{ "STRAND" };
517 return wantarray ? @bed_entry : \@bed_entry;
520 return wantarray ? @bed_entry : \@bed_entry if $cols == 6;
522 if ( defined $bp_record->{ "THICK_BEG" } and
523 defined $bp_record->{ "THICK_END" } )
525 $bed_entry[ thickStart ] = $bp_record->{ "THICK_BEG" };
526 $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" } + 1;
528 elsif ( defined $bp_record->{ "BLOCK_COUNT" } )
530 $bed_entry[ thickStart ] = $bed_entry[ chromStart ];
531 $bed_entry[ thickEnd ] = $bed_entry[ chromEnd ];
534 return wantarray ? @bed_entry : \@bed_entry if $cols == 8;
536 if ( defined $bp_record->{ "COLOR" } )
538 $bed_entry[ itemRgb ] = $bp_record->{ "COLOR" };
540 elsif ( defined $bp_record->{ "BLOCK_COUNT" } )
542 $bed_entry[ itemRgb ] = 0;
545 return wantarray ? @bed_entry : \@bed_entry if $cols == 9;
547 if ( defined $bp_record->{ "BLOCK_COUNT" } and
548 defined $bp_record->{ "BLOCK_LENS" } and
549 defined $bp_record->{ "S_BEGS" } )
551 @begs = split ",", $bp_record->{ "S_BEGS" };
552 map { $_ -= $bed_entry[ chromStart ] } @begs;
554 $bed_entry[ blockCount ] = $bp_record->{ "BLOCK_COUNT" };
555 $bed_entry[ blockSizes ] = $bp_record->{ "BLOCK_LENS" };
556 $bed_entry[ blockStarts ] = join ",", @begs;
557 # $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" } + 1;
559 elsif ( defined $bp_record->{ "BLOCK_COUNT" } and
560 defined $bp_record->{ "BLOCK_LENS" } and
561 defined $bp_record->{ "Q_BEGS" } )
563 $bed_entry[ blockCount ] = $bp_record->{ "BLOCK_COUNT" };
564 $bed_entry[ blockSizes ] = $bp_record->{ "BLOCK_LENS" };
565 $bed_entry[ blockStarts ] = $bp_record->{ "Q_BEGS" };
566 # $bed_entry[ thickEnd ] = $bp_record->{ "THICK_END" } + 1;
569 return wantarray ? @bed_entry : \@bed_entry;
573 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TAG CONTIGS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
576 sub tag_contigs_assemble
578 # Martin A. Hansen, November 2008.
580 # Returns a path to a BED file with tab contigs.
582 my ( $bed_file, # path to BED file
585 $dir, # working directory
590 my ( $fh_in, $fh_out, $array, $new_file, $pos, $id, $beg, $end, $score, $entry );
592 $fh_in = Maasha::Filesys::file_read_open( $bed_file );
594 $array = tag_contigs_assemble_array( $fh_in );
598 $new_file = "$bed_file.tag_contigs";
600 $fh_out = Maasha::Filesys::file_write_open( $new_file );
605 while ( ( $beg, $end, $score ) = tag_contigs_scan( $array, $pos ) and $beg )
607 $entry->[ chrom ] = $chr;
608 $entry->[ chromStart ] = $beg;
609 $entry->[ chromEnd ] = $end;
610 $entry->[ name ] = sprintf( "TC%06d", $id );
611 $entry->[ score ] = $score;
612 $entry->[ strand ] = $strand;
614 bed_entry_put( $entry, $fh_out );
626 sub tag_contigs_assemble_array
628 # Martin A. Hansen, November 2008.
630 # Given a BED file with entries from only one
631 # chromosome assembles tag contigs from these
632 # ignoring strand information. Only tags with
633 # a score higher than the clone count over
634 # genomic loci (the score field) is included
635 # in the tag contigs.
641 # ======================== tag contig
644 my ( $fh, # file handle to BED file
647 # Returns an arrayref.
649 my ( $entry, $clones, $score, @array );
651 while ( $entry = bed_entry_get( $fh ) )
653 if ( $entry->[ name ] =~ /_(\d+)$/ )
657 if ( $entry->[ score ] )
659 $score = int( $clones / $entry->[ score ] );
661 map { $array[ $_ ] += $score } $entry->[ chromStart ] .. $entry->[ chromEnd ] - 1 if $score >= 1;
666 return wantarray ? @array : \@array;
672 # Martin A. Hansen, November 2008.
674 # Scans an array with tag contigs and locates
675 # the next contig from a given position. The
676 # score of the tag contig is determined as the
677 # maximum value of the tag contig. If a tag contig
678 # is found a triple is returned with beg, end and score
679 # otherwise an empty list is returned.
681 my ( $array, # array to scan
682 $beg, # position to start scanning from
685 # Returns an arrayref.
691 while ( $beg < scalar @{ $array } and not $array->[ $beg ] ) { $beg++ }
695 while ( $array->[ $end ] )
697 $score = Maasha::Calc::max( $score, $array->[ $end ] );
703 return wantarray ? ( $beg, $end, $score ) : [ $beg, $end, $score ];
705 return wantarray ? () : [];
710 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<