From: martinahansen Date: Thu, 17 Jun 2010 11:36:00 +0000 (+0000) Subject: added fixedstep biopieces X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=9b9fbedec855fca622626c4bf479b40b8492172f;p=biopieces.git added fixedstep biopieces git-svn-id: http://biopieces.googlecode.com/svn/trunk@987 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/create_fixedstep_index b/bp_bin/create_fixedstep_index new file mode 100755 index 0000000..0eb2086 --- /dev/null +++ b/bp_bin/create_fixedstep_index @@ -0,0 +1,95 @@ +#!/usr/bin/env perl + +# Copyright (C) 2007-2010 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Create a fixedstep index from fixedstep entries in the stream for use with [get_fixedstep]. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use warnings; +use strict; +use Maasha::Common; +use Maasha::Biopieces; +use Maasha::UCSC::Wiggle; +use Maasha::Filesys; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $default, $formats, $options, $in, $out, $record, $tmp_dir, $file_out, $fh_out, $entry ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'no_stream', short => 'x', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'directory', short => 'd', type => 'dir', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, + { long => 'index_name', short => 'i', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, + ] +); + +Maasha::Common::error( qq(Directory already exists: "$options->{ 'directory' }") ) if -d $options->{ 'directory' }; + +Maasha::Filesys::dir_create_if_not_exists( $options->{ 'directory' } ); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$file_out = "$options->{ 'directory' }/$options->{ 'index_name' }.wig"; +$fh_out = Maasha::Filesys::file_write_open( $file_out ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { + Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; +} + +close $fh_out; + +Maasha::UCSC::Wiggle::fixedstep_index( $file_out, $options->{ 'directory' }, $options->{ 'index_name' } . ".index" ); + +Maasha::Biopieces::close_stream( $in ); +Maasha::Biopieces::close_stream( $out ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + Maasha::Biopieces::status_set(); +} + + +END +{ + Maasha::Biopieces::status_log(); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/bp_bin/get_fixedstep b/bp_bin/get_fixedstep new file mode 100755 index 0000000..9f93944 --- /dev/null +++ b/bp_bin/get_fixedstep @@ -0,0 +1,169 @@ +#!/usr/bin/env perl + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Extract fixedstep scores from an indexed fixedstep file. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use warnings; +use strict; +use Data::Dumper; +use Maasha::Biopieces; +use Maasha::Filesys; +use Maasha::Calc; +use Maasha::UCSC::Wiggle; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $options, $in, $out, $index, $fh, $record, $new_record, $subindex, $entry, $scores ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'index', short => 'i', type => 'string', mandatory => 'yes', default => undef, allowed => undef, disallowed => undef }, + { long => 'chr', short => 'c', type => 'string', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'beg', short => 'b', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 }, + { long => 'end', short => 'e', type => 'uint', mandatory => 'no', default => undef, allowed => undef, disallowed => 0 }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +$index = Maasha::UCSC::Wiggle::fixedstep_index_retrieve( $options->{ 'index' } . ".index" ); + +$fh = Maasha::Filesys::file_read_open( $options->{ 'index' } . ".wig" ); + +if ( $options->{ 'chr' } and exists $index->{ $options->{ 'chr' } } ) +{ + $subindex = Maasha::UCSC::Wiggle::fixedstep_index_lookup( $index, $options->{ 'chr' }, $options->{ 'beg' }, $options->{ 'end' } ); + + foreach $entry ( @{ $subindex } ) + { + $scores = get_scores( $fh, $entry->{ 'INDEX_BEG' }, $entry->{ 'INDEX_LEN' } ); + + if ( $options->{ 'beg' } > $entry->{ 'CHR_BEG' } ) { + $scores = [ @{ $scores }[ ( $options->{ 'beg' } - $entry->{ 'CHR_BEG' } ) .. scalar @{ $scores } - 1 ] ]; + } + + if ( $options->{ 'end' } < $entry->{ 'CHR_END' } ) { + $scores = [ @{ $scores }[ 0 .. ( $options->{ 'end' } - $options->{ 'beg' } ) ] ]; + } + + $new_record->{ 'REC_TYPE' } = 'fixed_step'; + $new_record->{ 'STEP' } = 1; + $new_record->{ 'CHR' } = $options->{ 'chr' }; + $new_record->{ 'CHR_BEG' } = Maasha::Calc::max( $options->{ 'beg' }, $entry->{ 'CHR_BEG' } ); + $new_record->{ 'VALS' } = join ";", @{ $scores }; + + Maasha::Biopieces::put_record( $new_record, $out ); + } +} + + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $record->{ 'CHR' } and $record->{ 'CHR_BEG' } and $record->{ 'CHR_END' } ) + { + if ( exists $index->{ $record->{ 'CHR' } } ) + { + $subindex = Maasha::UCSC::Wiggle::fixedstep_index_lookup( $index, $record->{ 'CHR' }, $record->{ 'CHR_BEG' }, $record->{ 'CHR_END' } ); + + foreach $entry ( @{ $subindex } ) + { + $scores = get_scores( $fh, $entry->{ 'INDEX_BEG' }, $entry->{ 'INDEX_LEN' } ); + + if ( $record->{ 'CHR_BEG' } > $entry->{ 'CHR_BEG' } ) { + $scores = [ @{ $scores }[ ( $record->{ 'CHR_BEG' } - $entry->{ 'CHR_BEG' } ) .. scalar @{ $scores } - 1 ] ]; + } + + if ( $record->{ 'CHR_END' } < $entry->{ 'CHR_END' } ) { + $scores = [ @{ $scores }[ 0 .. ( $record->{ 'CHR_END' } - $record->{ 'CHR_BEG' } ) ] ]; + } + + $new_record->{ 'REC_TYPE' } = 'fixed_step'; + $new_record->{ 'STEP' } = 1; + $new_record->{ 'CHR' } = $record->{ 'CHR' }; + $new_record->{ 'CHR_BEG' } = Maasha::Calc::max( $record->{ 'CHR_BEG' }, $entry->{ 'CHR_BEG' } ); + $new_record->{ 'VALS' } = join ";", @{ $scores }; + + Maasha::Biopieces::put_record( $new_record, $out ); + } + } + } +} + +close $fh; + +Maasha::Biopieces::close_stream( $in ); +Maasha::Biopieces::close_stream( $out ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub get_scores +{ + # Martin A. Hansen, June 2010. + + # Get scores from a fixedstep file based on index + # offset and length. + + my ( $fh, # filehandle to fixedstep file + $offset, # file offset + $len, # length + ) = @_; + + # Returns a list. + + my ( $block, @scores ); + + $block = Maasha::Filesys::file_read( $fh, $offset, $len ); + + @scores = split "\n", $block; + + return wantarray ? @scores : \@scores; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + Maasha::Biopieces::status_set(); +} + + +END +{ + Maasha::Biopieces::status_log(); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/code_perl/Maasha/UCSC/Wiggle.pm b/code_perl/Maasha/UCSC/Wiggle.pm index ce52c73..77755ce 100644 --- a/code_perl/Maasha/UCSC/Wiggle.pm +++ b/code_perl/Maasha/UCSC/Wiggle.pm @@ -36,6 +36,7 @@ use strict; use Data::Dumper; use Maasha::Common; use Maasha::Filesys; +use Maasha::Matrix; use Maasha::Calc; use vars qw( @ISA @EXPORT_OK ); @@ -63,12 +64,6 @@ use constant { blockCount => 9, blockSizes => 10, blockStarts => 11, - - FS_CHR_BEG => 0, - FS_NEXT_CHR_BEG => 1, - FS_CHR_END => 2, - FS_INDEX_BEG => 3, - FS_INDEX_LEN => 4, }; @@ -326,63 +321,92 @@ sub fixedstep_scan } +sub fixedstep_index +{ + # Martin A. Hansen, June 2010. + + # Indexes a specified fixedstep file and saves the index + # to a specified directory and index name. + + my ( $path, # fixedstep file to index + $dir, # direcotory to place index + $name, # name of index + ) = @_; + + # Returns nothing. + + my ( $index ); + + $index = fixedstep_index_create( $path ); + fixedstep_index_store( "$dir/$name", $index ); +} + + sub fixedstep_index_create { - # Martin A. Hansen, January 2008. + # Martin A. Hansen, June 2010. - # Indexes a concatenated fixedStep file. + # Indexes a specified fixedStep file. # The index consists of a hash with chromosomes as keys, - # and a list of [ chr_beg, next_chr_beg, chr_end, index_beg, index_len ] as values. + # and a list of { chr_beg, chr_end, index_beg, index_len } + # hashes as values. my ( $path, # path to fixedStep file ) = @_; # Returns a hashref - my ( $fh, $pos, $index_beg, $index_len, $entry, $locator, $chr, $step, $beg, $end, $len, %index, $i ); + my ( $fh, $pos, $line, $index_beg, $index_len, $chr, $step, $chr_beg, $chr_end, $len, %index ); $fh = Maasha::Filesys::file_read_open( $path ); - $pos = 0; + $index_beg = 0; + $index_len = 0; + $chr_beg = 0; + $chr_end = 0; + $pos = 0; - while ( $entry = Maasha::UCSC::fixedstep_get_entry( $fh ) ) + while ( $line = <$fh> ) { - $locator = shift @{ $entry }; - - if ( $locator =~ /chrom=([^ ]+) start=(\d+) step=(\d+)/ ) + if ( $line =~ /^fixedStep chrom=([^ ]+) start=(\d+) step=(\d+)/ ) { - $chr = $1; - $beg = $2 - 1; # fixedStep files are 1-based - $step = $3; + if ( $chr_end > $chr_beg ) + { + $index_len = $pos - $index_beg; + + push @{ $index{ $chr } }, { + CHR_BEG => $chr_beg, + CHR_END => $chr_end - 1, + INDEX_BEG => $index_beg, + INDEX_LEN => $index_len, + }; + } + + $chr = $1; + $chr_beg = $2 - 1; # fixedStep files are 1-based + $step = $3; + $chr_end = $chr_beg; + + $index_beg = $pos + length $line; } else { - Maasha::Common::error( qq(Could not parse locator: $locator) ); + $chr_end++; } - $pos += length( $locator ) + 11; - - $index_beg = $pos; - -# map { $pos += length( $_ ) + 1 } @{ $entry }; - - $pos += 6 * scalar @{ $entry }; - - $index_len = $pos - $index_beg; - - push @{ $index{ $chr } }, [ $beg, undef, $beg + scalar @{ $entry } - 1, $index_beg, $index_len ]; + $pos += length $line; } - close $fh; + $index_len = $pos - $index_beg; - foreach $chr ( keys %index ) - { - for ( $i = 0; $i < @{ $index{ $chr } } - 1; $i++ ) { - $index{ $chr }->[ $i ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ $i + 1 ]->[ 0 ]; - } + push @{ $index{ $chr } }, { + CHR_BEG => $chr_beg, + CHR_END => $chr_end - 1, + INDEX_BEG => $index_beg, + INDEX_LEN => $index_len, + }; - $index{ $chr }->[ -1 ]->[ FS_NEXT_CHR_BEG ] = $index{ $chr }->[ -1 ]->[ FS_CHR_END ] + 1; - } + close $fh; return wantarray ? %index : \%index; } @@ -427,129 +451,26 @@ sub fixedstep_index_lookup { # Martin A. Hansen, January 2008. - # Retrieve fixedStep scores from a indexed - # fixedStep file given a chromosome and - # begin and end positions. + # Locate fixedStep entries from index based on + # chr, chr_beg and chr_end. my ( $index, # data structure - $fh, # filehandle to datafile $chr, # chromosome $chr_beg, # chromosome beg $chr_end, # chromosome end - $flank, # include flanking region - OPTIONAL ) = @_; # Returns a list + + my ( @subindex ); - my ( $index_beg, $index_end, $i, $c, $beg, $end, @vals, $scores ); - - $flank ||= 0; - - $chr_beg -= $flank; - $chr_end += $flank; - - # print "chr_beg->$chr_beg chr_end->$chr_end flank->$flank\n"; - - if ( exists $index->{ $chr } ) - { - $index_beg = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_beg ); - - if ( $index_beg < 0 ) { - Maasha::Common::error( qq(Index search failed - begin index position doesn't exists: $chr_beg) ); - } - - if ( $chr_end < $index->{ $chr }->[ $index_beg ]->[ 1 ] ) - { - $index_end = $index_beg; - } - else - { - $index_end = Maasha::Matrix::interval_search( $index->{ $chr }, 0, 1, $chr_end ); - - if ( $index_end < 0 ) { - Maasha::Common::error( qq(Index search failed - end index position doesn't exists: $chr_end) ); - } - } - - map { $scores->[ $_ ] = 0 } 0 .. $chr_end - $chr_beg; - - if ( $index_beg == $index_end ) - { - $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ); - $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ); - - if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] and $end >= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ) - { - @vals = split "\n", Maasha::Filesys::file_read( - $fh, - $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ), - 6 * ( $end - $beg + 1 ), - ); - } - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; - } - } - else - { - $beg = Maasha::Calc::max( $chr_beg, $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ); - -# print Dumper( $beg, $index->{ $chr }->[ $index_beg ] ); -# print Dumper( "next", $index->{ $chr }->[ $index_beg ]->[ FS_NEXT_CHR_BEG ] ); - - # beg next - # v v - # |||||||||....... - - if ( $beg <= $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] ) - { - @vals = split "\n", Maasha::Filesys::file_read( - $fh, - $index->{ $chr }->[ $index_beg ]->[ FS_INDEX_BEG ] + 6 * ( $beg - $index->{ $chr }->[ $index_beg ]->[ FS_CHR_BEG ] ), - 6 * ( $index->{ $chr }->[ $index_beg ]->[ FS_CHR_END ] - $beg + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $beg - $chr_beg ] = $vals[ $c ]; - } - } - - $end = Maasha::Calc::min( $chr_end, $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ); - - if ( $end <= $index->{ $chr }->[ $index_end ]->[ FS_CHR_END ] ) - { - @vals = split "\n", Maasha::Filesys::file_read( - $fh, - $index->{ $chr }->[ $index_end ]->[ FS_INDEX_BEG ], - 6 * ( $end - $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $index->{ $chr }->[ $index_end ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ]; - } - } - - for ( $i = $index_beg + 1; $i <= $index_end - 1; $i++ ) - { - @vals = split "\n", Maasha::Filesys::file_read( - $fh, - $index->{ $chr }->[ $i ]->[ FS_INDEX_BEG ], - 6 * ( $index->{ $chr }->[ $i ]->[ FS_CHR_END ] - $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] + 1 ), - ); - - for ( $c = 0; $c < @vals; $c++ ) { - $scores->[ $c + $index->{ $chr }->[ $i ]->[ FS_CHR_BEG ] - $chr_beg ] = $vals[ $c ]; - } - } - } - } - else - { - Maasha::Common::error( qq(Chromosome "$chr" was not found in index) ); + if ( exists $index->{ $chr } ) { + @subindex = grep { $_->{ 'CHR_END' } >= $chr_beg and $_->{ 'CHR_BEG' } <= $chr_end } @{ $index->{ $chr } }; + } else { + Maasha::Common::error( qq(Chromosome "$chr" not found in index) ); } - return wantarray ? @{ $scores } : $scores; + return wantarray ? @subindex : \@subindex; }