CHR_END => 2,
INDEX_BEG => 3,
INDEX_LEN => 4,
+
+ CHR => 0,
+ CHR_BEG => 1,
+ CHR_END => 2,
+ Q_ID => 3,
+ SCORE => 4,
+ STRAND => 5,
+ THICK_BEG => 6,
+ THICK_END => 7,
+ ITEMRGB => 8,
+ BLOCKCOUNT => 9,
+ BLOCKSIZES => 10,
+ Q_BEGS => 11,
};
@ISA = qw( Exporter );
# http://genome.ucsc.edu/goldenPath/help/hgTracksHelp.html#BED
+sub bed_entry_get_array
+{
+ # Martin A. Hansen, September 2008.
+
+ # Reads a BED entry given a filehandle.
+
+ # This is a new _faster_ BED entry parser that
+ # uses arrays and not hashrefs.
+
+ # IMPORTANT! This function does not correct the
+ # BED_END position that is kept the way UCSC
+ # does.
+
+ my ( $fh, # file handle
+ $cols, # columns to read - OPTIONAL (3,4,5,6, or 12)
+ ) = @_;
+
+ # Returns a list.
+
+ my ( $line, @entry );
+
+ $line = <$fh>;
+
+ $line =~ tr/\n\r//d; # some people have carriage returns in their BED files -> Grrrr
+
+ return if not defined $line;
+
+ if ( not defined $cols ) {
+ $cols = 1 + $line =~ tr/\t//;
+ }
+
+ @entry = split "\t", $line, $cols + 1;
+
+ pop @entry if scalar @entry > $cols;
+
+ return wantarray ? @entry : \@entry;
+}
+
+
sub bed_get_entry
{
# Martin A. Hansen, December 2007.
}
+sub bed_entry_put_array
+{
+ # Martin A. Hansen, Septermber 2008.
+
+ # Writes a BED entry array to file.
+
+ # IMPORTANT! This function does not correct the
+ # BED_END position that is assumed to be in the
+ # UCSC positions scheme.
+
+ my ( $record, # list
+ $fh, # file handle - OPTIONAL
+ $cols, # number of columns in BED file - OPTIONAL
+ ) = @_;
+
+ # Returns nothing.
+
+ $fh = \*STDOUT if not defined $fh;
+
+ if ( defined $cols ) {
+ print $fh join( "\t", @{ $record }[ 0 .. $cols - 1 ] ), "\n";
+ } else {
+ print $fh join( "\t", @{ $record } ), "\n";
+ }
+}
+
+
sub bed_put_entry
{
# Martin A. Hansen, Septermber 2007.
sub bed_sort
{
- # Martin A. Hansen, March 2008.
+ # Martin A. Hansen, Septermber 2008
- # Sort a potential huge BED file according to
- # CHR, CHR_BEG and optionally STRAND.
+ # Sorts a BED file using the c program
+ # "bed_sort" specifing a sort mode:
- my ( $tmp_dir, # temporary directory used for sorting
- $file, # BED file to sort
- $strand, # flag to sort on strand - OPTIONAL
- ) = @_;
+ # 1: chr AND chr_beg.
+ # 2: chr AND strand AND chr_beg.
+ # 3: chr_beg.
+ # 4: strand AND chr_beg.
- # Returns nothing.
+ my ( $bed_file, # BED file to sort
+ $sort_mode, # See above.
+ $cols, # Number of columns in BED file
+ ) = @_;
- my ( $fh_in, $key, $fh_out, %fh_hash, $part_file, $entry, $entries );
+ &Maasha::Common::run( "bed_sort", "--sort $sort_mode --cols $cols $bed_file" );
+}
- $fh_in = Maasha::Common::read_open( $file );
- while ( $entry = bed_get_entry( $fh_in ) )
- {
- if ( $strand ) {
- $key = join "_", $entry->{ "CHR" }, $entry->{ "STRAND" };
- } else {
- $key = $entry->{ "CHR" };
- }
+sub bed_split_to_files
+{
+ # Martin A. Hansen, Septermber 2008
- $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.sort" ) if not exists $fh_hash{ $key };
-
- bed_put_entry( $entry, $fh_hash{ $key } );
- }
+ # Given a list of BED files, split these
+ # into temporary files based on the chromosome
+ # name. Returns a list of the temporary files.
- close $fh_in;
+ my ( $bed_files, # list of BED files to split
+ $cols, # number of columns
+ $tmp_dir, # temporary directory
+ ) = @_;
- map { close $_ } keys %fh_hash;
+ # Returns a list.
- $fh_out = Maasha::Common::write_open( "$tmp_dir/temp.sort" );
+ my ( $bed_file, $fh_in, $entry, $key, %fh_hash, @tmp_files );
- foreach $part_file ( sort keys %fh_hash )
+ foreach $bed_file ( @{ $bed_files } )
{
- $entries = bed_get_entries( "$tmp_dir/$part_file.sort" );
+ $fh_in = Maasha::Common::read_open( $bed_file );
- @{ $entries } = sort { $a->{ "CHR_BEG" } <=> $b->{ "CHR_BEG" } } @{ $entries };
-
- map { bed_put_entry( $_, $fh_out ) } @{ $entries };
+ while ( $entry = bed_entry_get_array( $fh_in, $cols ) )
+ {
+ $key = $entry->[ CHR ];
+
+ $fh_hash{ $key } = Maasha::Common::write_open( "$tmp_dir/$key.temp" ) if not exists $fh_hash{ $key };
+
+ bed_entry_put_array( $entry, $fh_hash{ $key } );
+ }
- unlink "$tmp_dir/$part_file.sort";
+ close $fh_in;
}
- close $fh_out;
+ foreach $key ( sort keys %fh_hash )
+ {
+ push @tmp_files, "$tmp_dir/$key.temp";
+
+ close $fh_hash{ $key };
+ }
- rename "$tmp_dir/temp.sort", $file;
+ return wantarray ? @tmp_files : \@tmp_files;
}
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> PSL format <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+sub psl_get_entry
+{
+ # Martin A. Hansen, August 2008.
+
+ # Reads PSL next entry from a PSL file and returns a record.
+
+ my ( $fh, # file handle of PSL filefull path to PSL file
+ ) = @_;
+
+ # Returns hashref.
+
+ my ( $line, @fields, %record );
+
+ while ( $line = <$fh> )
+ {
+ chomp $line;
+
+ @fields = split "\t", $line;
+
+ if ( scalar @fields == 21 )
+ {
+ %record = (
+ REC_TYPE => "PSL",
+ MATCHES => $fields[ 0 ],
+ MISMATCHES => $fields[ 1 ],
+ REPMATCHES => $fields[ 2 ],
+ NCOUNT => $fields[ 3 ],
+ QNUMINSERT => $fields[ 4 ],
+ QBASEINSERT => $fields[ 5 ],
+ SNUMINSERT => $fields[ 6 ],
+ SBASEINSERT => $fields[ 7 ],
+ STRAND => $fields[ 8 ],
+ Q_ID => $fields[ 9 ],
+ Q_LEN => $fields[ 10 ],
+ Q_BEG => $fields[ 11 ],
+ Q_END => $fields[ 12 ] - 1,
+ S_ID => $fields[ 13 ],
+ S_LEN => $fields[ 14 ],
+ S_BEG => $fields[ 15 ],
+ S_END => $fields[ 16 ] - 1,
+ BLOCKCOUNT => $fields[ 17 ],
+ BLOCKSIZES => $fields[ 18 ],
+ Q_BEGS => $fields[ 19 ],
+ S_BEGS => $fields[ 20 ],
+ );
+
+ $record{ "SCORE" } = $record{ "MATCHES" } + int( $record{ "REPMATCHES" } / 2 ) - $record{ "MISMATCHES" } - $record{ "QNUMINSERT" } - $record{ "SNUMINSERT" };
+ $record{ "SPAN" } = $fields[ 12 ] - $fields[ 11 ];
+
+ return wantarray ? %record : \%record;
+ }
+ }
+
+ return undef;
+}
+
+
sub psl_get_entries
{
# Martin A. Hansen, February 2008.
- # Reads PSL entries and returns a record.
+ # Reads PSL entries and returns a list of records.
my ( $path, # full path to PSL file
) = @_;
}
-sub fixedStep_index_lookup
+sub fixedstep_index_lookup
{
# Martin A. Hansen, January 2008.