From 766cde1e17284bd29e5ca8ec31749665f0f04886 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 17 Dec 2009 13:56:25 +0000 Subject: [PATCH] simplified BGB index scheme git-svn-id: http://biopieces.googlecode.com/svn/trunk@825 74ccb610-7750-0410-82ae-013aeee3265d --- code_perl/Maasha/KISS.pm | 361 ++++++++++++++++----------------------- 1 file changed, 144 insertions(+), 217 deletions(-) diff --git a/code_perl/Maasha/KISS.pm b/code_perl/Maasha/KISS.pm index d9b2824..85a8105 100644 --- a/code_perl/Maasha/KISS.pm +++ b/code_perl/Maasha/KISS.pm @@ -56,6 +56,10 @@ use constant { INDEX_BLOCK_SIZE => 100, INDEX_LEVEL => 100_000_000, INDEX_FACTOR => 100, + + BUCKET_SIZE => 100, + COUNT => 0, + OFFSET => 1, }; @@ -197,168 +201,76 @@ sub kiss_sort } -sub kiss_intersect +sub kiss_index { # Martin A. Hansen, December 2009. - # Given filehandles to two different unsorted KISS files - # intersect the entries so that all entries from file1 that - # overlap entries in file2 are returned - unless the inverse flag - # is given in which case entreis from file1 that does not - # overlap any entries in file2 are returned. + # Creates a lookup index of a sorted KISS file. - my ( $fh1, # filehandle to file1 - $fh2, # filehandle to file2 - $inverse, # flag indicating inverse matching - OPTIONAL + my ( $file, # path to KISS file ) = @_; - # Returns a list - - my ( $entry, %lookup, $pos, $overlap, @entries ); - - while ( $entry = kiss_entry_get( $fh2 ) ) { - map { $lookup{ $_ } = 1 } ( $entry->{ 'S_BEG' } .. $entry->{ 'S_END' } ); - } - - while ( $entry = kiss_entry_get( $fh1 ) ) - { - $overlap = 0; - - foreach $pos ( $entry->{ 'S_BEG' } .. $entry->{ 'S_END' } ) - { - if ( exists $lookup{ $pos } ) - { - $overlap = 1; - - last; - } - } - - if ( $overlap and not $inverse ) { - push @entries, $entry; - } elsif ( not $overlap and $inverse ) { - push @entries, $entry; - } - } - - return wantarray ? @entries : \@entries; -} - - -sub kiss_index -{ - # Martin A, Hansen, November 2009. - - # Creates an index of a sorted KISS file. - - my ( $file, # KISS file - ) = @_; + # Returns nothing. - # Returns a hashref. + my ( $fh, $offset, $line, $s_beg, $bucket, $index ); - my ( $tree, $offset, $fh, $line, $beg ); + $fh = Maasha::Filesys::file_read_open( $file ); - $tree = {}; $offset = 0; - $fh = Maasha::Filesys::file_read_open( $file ); - while ( $line = <$fh> ) { - ( undef, $beg ) = split "\t", $line, 3; + ( undef, $s_beg ) = split "\t", $line, 3; + + $bucket = int( $s_beg / BUCKET_SIZE ); + + $index->[ $bucket ]->[ COUNT ]++; + $index->[ $bucket ]->[ OFFSET ] = $offset if not defined $index->[ $bucket ]->[ OFFSET ]; - kiss_index_node_add( $tree, INDEX_LEVEL, INDEX_FACTOR, $beg, $offset ); - $offset += length $line; } close $fh; - kiss_index_store( "$file.index", $tree ); -} - - -sub kiss_index_node_add -{ - # Martin A, Hansen, November 2009. - - # Recursive routine to add nodes to a tree. - - my ( $node, - $level, - $factor, - $beg, - $offset, - $sum, - ) = @_; - - my ( $bucket ); - - $sum ||= 0; - $bucket = int( $beg / $level ); - - if ( $level >= $factor ) - { - $sum += $bucket * $level; - $beg -= $bucket * $level; - - $node->{ 'CHILDREN' }->[ $bucket ]->{ 'COUNT' }++; - # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'LEVEL' } = $level; - # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BUCKET' } = $bucket; - $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BEG' } = $sum; - $node->{ 'CHILDREN' }->[ $bucket ]->{ 'END' } = $sum + $level - 1; - $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' } = $offset if not defined $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' }; - - kiss_index_node_add( $node->{ 'CHILDREN' }->[ $bucket ], $level / $factor, $factor, $beg, $offset, $sum ); - } + Maasha::KISS::kiss_index_store( "$file.index", $index ); } sub kiss_index_offset { - # Martin A. Hansen, November 2009. + # Martin A. Hansen, December 2009. # Given a KISS index and a begin position, # locate the offset closest to the begin position, # and return this. - + my ( $index, # KISS index $beg, # begin position - $level, # index level - OPTIONAL - $factor, # index factor - OPTIONAL ) = @_; # Returns a number. - my ( $child, $offset ); + my ( $bucket ); - $level ||= INDEX_LEVEL; - $factor ||= INDEX_FACTOR; + Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0; - foreach $child ( @{ $index->{ 'CHILDREN' } } ) - { - next if not defined $child; + $bucket = int( $beg / BUCKET_SIZE ); - if ( $child->{ 'BEG' } <= $beg and $beg <= $child->{ 'END' } ) - { - if ( $level == $factor ) { - $offset = $child->{ 'OFFSET' }; - } else { - $offset = kiss_index_offset( $child, $beg, $level / $factor, $factor ); - } - } - } + $bucket = scalar @{ $index } if $bucket > scalar @{ $index }; - # Maasha::Common::error( "No offset" ) if not defined $offset; # FIXME + while ( $bucket >= 0 ) + { + return $index->[ $bucket ]->[ OFFSET ] if defined $index->[ $bucket ]; - return $offset; + $bucket--; + } } sub kiss_index_count { - # Martin A. Hansen, November 2009. - + # Martin A. Hansen, December 2009. + # Given a KISS index and a begin/end interval # sum the number of counts in that interval, # and return this. @@ -366,68 +278,26 @@ sub kiss_index_count my ( $index, # KISS index $beg, # Begin position $end, # End position - $level, # index level - OPTIONAL - $factor, # index factor - OPTIONAL ) = @_; # Returns a number. - - my ( $count, $child ); - - $level ||= INDEX_LEVEL; - $factor ||= INDEX_FACTOR; - $count ||= 0; - - foreach $child ( @{ $index->{ 'CHILDREN' } } ) - { - next if not defined $child; - - if ( $level >= $factor ) - { - if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) ) - { - $count += $child->{ 'COUNT' } if $level == $factor; - $count += kiss_index_count( $child, $beg, $end, $level / $factor, $factor ); - } - } - } - - return $count; -} - - -sub kiss_index_store -{ - # Martin A. Hansen, November 2009. - - # Stores a KISS index to a file. - - my ( $path, # path to KISS index - $index, # KISS index - ) = @_; - - # Returns nothing. - - Maasha::Filesys::file_store( $path, $index ); -} - -sub kiss_index_retrieve -{ - # Martin A. Hansen, November 2009. + my ( $bucket_beg, $bucket_end, $count, $i ); - # Retrieves a KISS index from a file. + Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0; - my ( $path, # Path to KISS index - ) = @_; + $bucket_beg = int( $beg / BUCKET_SIZE ); + $bucket_end = int( $end / BUCKET_SIZE ); - # Returns a data structure. + $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index }; - my ( $index ); + $count = 0; - $index = Maasha::Filesys::file_retrieve( $path ); + for ( $i = $bucket_beg; $i <= $bucket_end; $i++ ) { + $count += $index->[ $i ]->[ COUNT ] if defined $index->[ $i ]; + } - return wantarray ? @{ $index } : $index; + return $count; } @@ -455,7 +325,7 @@ sub kiss_index_get_entries sysseek( $fh, $offset, 0 ); - while ( $entry = kiss_entry_get( $fh ) ) + while ( $entry = Maasha::KISS::kiss_entry_get( $fh ) ) { push @entries, $entry if $entry->{ 'S_END' } > $beg; @@ -470,67 +340,124 @@ sub kiss_index_get_entries sub kiss_index_get_blocks { - # Martin A. Hansen, November 2009. + # Martin A. Hansen, December 2009. - # Given a KISS index recursively traverse - # this into the appropriate node size determined - # by the size of the given beg/end interval. - # Blocks consisting of hashes of BEG/END/COUNT - # are returned from the requested node size. + # Given a KISS index returns blocks from this in a + # given position interval. Blocks consisting of + # hashes of BEG/END/COUNT. my ( $index, # KISS index node $beg, # interval begin $end, # interval end - $level, # index level - OPTIONAL - $factor, # index factor - OPTIONAL - $size, # requested node size ) = @_; # Returns a list. - - my ( $len, @blocks, $child ); - - $level ||= INDEX_LEVEL; - $factor ||= INDEX_FACTOR; - - $size ||= 100; # TODO: lazy list loading? - -# if ( not defined $size ) -# { -# $len = $end - $beg + 1; -# -# if ( $len > 100_000_000 ) { -# $size = 1_000_000; -# } elsif ( $len > 1_000_000 ) { -# $size = 10_000; -# } else { -# $size = 100; -# } -# } - - if ( $level >= $size ) + + my ( $bucket_beg, $bucket_end, $i, @blocks ); + + Maasha::Common::error( qq(Negative begin position: "$beg") ) if $beg < 0; + + $bucket_beg = int( $beg / BUCKET_SIZE ); + $bucket_end = int( $end / BUCKET_SIZE ); + + $bucket_end = scalar @{ $index } if $bucket_end > scalar @{ $index }; + + for ( $i = $bucket_beg; $i <= $bucket_end; $i++ ) { - foreach $child ( @{ $index->{ 'CHILDREN' } } ) + if ( defined $index->[ $i ] ) { - next if not defined $child; + push @blocks, { + BEG => BUCKET_SIZE * $i, + END => BUCKET_SIZE * ( $i + 1 ), + COUNT => $index->[ $i ]->[ COUNT ], + }; + } + } + + return wantarray ? @blocks : \@blocks; +} + + +sub kiss_intersect +{ + # Martin A. Hansen, December 2009. - if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) ) + # Given filehandles to two different unsorted KISS files + # intersect the entries so that all entries from file1 that + # overlap entries in file2 are returned - unless the inverse flag + # is given in which case entreis from file1 that does not + # overlap any entries in file2 are returned. + + my ( $fh1, # filehandle to file1 + $fh2, # filehandle to file2 + $inverse, # flag indicating inverse matching - OPTIONAL + ) = @_; + + # Returns a list + + my ( $entry, %lookup, $pos, $overlap, @entries ); + + while ( $entry = kiss_entry_get( $fh2 ) ) { + map { $lookup{ $_ } = 1 } ( $entry->{ 'S_BEG' } .. $entry->{ 'S_END' } ); + } + + while ( $entry = kiss_entry_get( $fh1 ) ) + { + $overlap = 0; + + foreach $pos ( $entry->{ 'S_BEG' } .. $entry->{ 'S_END' } ) + { + if ( exists $lookup{ $pos } ) { - if ( $level == $size ) - { - push @blocks, { - BEG => $child->{ 'BEG' }, - END => $child->{ 'END' }, - COUNT => $child->{ 'COUNT' }, - }; - } - - push @blocks, kiss_index_get_blocks( $child, $beg, $end, $level / $factor, $factor, $size ); + $overlap = 1; + + last; } } + + if ( $overlap and not $inverse ) { + push @entries, $entry; + } elsif ( not $overlap and $inverse ) { + push @entries, $entry; + } } - return wantarray ? @blocks : \@blocks; + return wantarray ? @entries : \@entries; +} + + +sub kiss_index_store +{ + # Martin A. Hansen, November 2009. + + # Stores a KISS index to a file. + + my ( $path, # path to KISS index + $index, # KISS index + ) = @_; + + # Returns nothing. + + Maasha::Filesys::file_store( $path, $index ); +} + + +sub kiss_index_retrieve +{ + # Martin A. Hansen, November 2009. + + # Retrieves a KISS index from a file. + + my ( $path, # Path to KISS index + ) = @_; + + # Returns a data structure. + + my ( $index ); + + $index = Maasha::Filesys::file_retrieve( $path ); + + return wantarray ? @{ $index } : $index; } @@ -552,7 +479,7 @@ sub kiss_align_enc # Maasha::Common::error( "Sequence lengths don't match" ) if length ${ $s_seq } != length ${ $q_seq }; - if ( length ${ $s_seq } != length ${ $q_seq } ) { # for unknown reasons this situation may occur - TODO + if ( length ${ $s_seq } != length ${ $q_seq } ) { # for unknown reasons this situation may occur - TODO FIXME return wantarray ? () : []; } -- 2.39.5