# Routines for parsing and emitting KISS records.
+# http://code.google.com/p/biopieces/wiki/KissFormat
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
use Data::Dumper;
use Maasha::Common;
use Maasha::Filesys;
+use Maasha::Align;
use Maasha::SQL;
use vars qw( @ISA @EXPORT );
@ISA = qw( Exporter );
use constant {
- S_ID => 0,
- S_BEG => 1,
- S_END => 2,
- Q_ID => 3,
- SCORE => 4,
- STRAND => 5,
- HITS => 6,
- ALIGN => 7,
- BLOCK_COUNT => 8,
- BLOCK_BEGS => 9,
- BLOCK_LENS => 10,
- BLOCK_TYPE => 11,
+ S_ID => 0,
+ S_BEG => 1,
+ S_END => 2,
+ Q_ID => 3,
+ SCORE => 4,
+ STRAND => 5,
+ HITS => 6,
+ ALIGN => 7,
+ BLOCK_COUNT => 8,
+ BLOCK_BEGS => 9,
+ BLOCK_LENS => 10,
+ BLOCK_TYPE => 11,
+ INDEX_BLOCK_SIZE => 100,
+ INDEX_LEVEL => 100_000_000,
+ INDEX_FACTOR => 100,
};
-# 0 1 2
-# 012345678901234567890
-# --------------------- S.aur complete genome
-# -===__===- TAG_000001
-# 0123456789
-#
-# S_ID = 'S.aur complete genome'
-# S_BEG = 3
-# S_END = 12
-# Q_ID = 'TAG_000001'
-# SCORE => 1
-# STRAND => +
-# HITS => 31
-# ALIGN => 0:A>T,3:G>C
-# BLOCK_COUNT => 2
-# BLOCK_BEGS => 1,6
-# BLOCK_LENS => 3,3
-# BLOCK_TYPE => 1,1
-#
-#
-# 'S.aur complete genome' 3 12 'TAG_000001' 1 + 31 2 1,6 3,3 1,1
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
@fields = split /\t/, $line;
- Maasha::Common::error( qq( BAD kiss entry: $line) ) if not @fields == 12;
+ Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
$entry{ 'S_ID' } = $fields[ S_ID ];
$entry{ 'S_BEG' } = $fields[ S_BEG ];
my ( @fields );
- $fh ||= \*STDOUT;
-
- $fields[ S_ID ] = $entry->{ 'S_ID' };
- $fields[ S_BEG ] = $entry->{ 'S_BEG' };
- $fields[ S_END ] = $entry->{ 'S_END' };
- $fields[ Q_ID ] = $entry->{ 'Q_ID' };
- $fields[ SCORE ] = $entry->{ 'SCORE' };
- $fields[ STRAND ] = $entry->{ 'STRAND' };
- $fields[ HITS ] = $entry->{ 'HITS' };
- $fields[ ALIGN ] = $entry->{ 'ALIGN' };
- $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' };
- $fields[ BLOCK_BEGS ] = $entry->{ 'BLOCK_BEGS' };
- $fields[ BLOCK_LENS ] = $entry->{ 'BLOCK_LENS' };
- $fields[ BLOCK_TYPE ] = $entry->{ 'BLOCK_TYPE' };
-
- print $fh join( "\t", @fields ), "\n";
+ if ( defined $entry->{ 'S_ID' } and
+ defined $entry->{ 'S_BEG' } and
+ defined $entry->{ 'S_END' }
+ )
+ {
+ $fh ||= \*STDOUT;
+
+ $fields[ S_ID ] = $entry->{ 'S_ID' };
+ $fields[ S_BEG ] = $entry->{ 'S_BEG' };
+ $fields[ S_END ] = $entry->{ 'S_END' };
+ $fields[ Q_ID ] = $entry->{ 'Q_ID' };
+ $fields[ SCORE ] = $entry->{ 'SCORE' };
+ $fields[ STRAND ] = $entry->{ 'STRAND' };
+ $fields[ HITS ] = $entry->{ 'HITS' };
+ $fields[ ALIGN ] = $entry->{ 'ALIGN' };
+ $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' };
+ $fields[ BLOCK_BEGS ] = $entry->{ 'BLOCK_BEGS' };
+ $fields[ BLOCK_LENS ] = $entry->{ 'BLOCK_LENS' };
+ $fields[ BLOCK_TYPE ] = $entry->{ 'BLOCK_TYPE' };
+
+ print $fh join( "\t", @fields ), "\n";
+ }
}
sub kiss_index
{
- # Martin A, Hansen, October 2009.
+ # Martin A, Hansen, November 2009.
- # Creates an index of a sorted KISS file that
- # allowing the location of the byte position
- # from where records can be read given a
- # specific S_BEG position. The index consists of
- # triples: [ beg, end, bytepos ], where beg and
- # end denotes the interval where the next KISS
- # record begins at bytepos.
+ # Creates an index of a sorted KISS file.
- my ( $fh, # filehandle to KISS file
+ my ( $file, # KISS file to index
) = @_;
- # Returns a list.
+ # Returns a hashref.
- my ( $line, @fields, $beg, $end, $pos, @index );
+ my ( $tree, $offset, $fh, $line, $beg );
- $beg = 0;
- $pos = 0;
+ $tree = {};
+ $offset = 0;
+
+ $fh = Maasha::Filesys::file_read_open( $file );
while ( $line = <$fh> )
{
- chomp $line;
+ ( undef, $beg ) = split "\t", $line, 3;
- @fields = split /\t/, $line, 3;
+ kiss_index_node_add( $tree, INDEX_LEVEL, INDEX_FACTOR, $beg, $offset );
+
+ $offset += length $line;
+ }
+
+ close $fh;
+
+ kiss_index_store( "$file.index", $tree );
+}
+
+
+sub kiss_index_node_add
+{
+ # Martin A, Hansen, November 2009.
+
+ # Recursive routine to add nodes to a tree.
+
+ my ( $node,
+ $level,
+ $factor,
+ $beg,
+ $offset,
+ $sum,
+ ) = @_;
+
+ my ( $bucket );
+
+ $sum ||= 0;
+ $bucket = int( $beg / $level );
+
+ if ( $level >= $factor )
+ {
+ $sum += $bucket * $level;
+ $beg -= $bucket * $level;
- $end = $fields[ S_BEG ];
+ $node->{ 'CHILDREN' }->[ $bucket ]->{ 'COUNT' }++;
+ # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'LEVEL' } = $level;
+ # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BUCKET' } = $bucket;
+ $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BEG' } = $sum;
+ $node->{ 'CHILDREN' }->[ $bucket ]->{ 'END' } = $sum + $level - 1;
+ $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' } = $offset if not defined $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' };
+
+ kiss_index_node_add( $node->{ 'CHILDREN' }->[ $bucket ], $level / $factor, $factor, $beg, $offset, $sum );
+ }
+}
- if ( $end == 0 )
- {
- push @index, [ $beg, $end, $pos ];
- $beg = 1;
- }
- elsif ( $end > $beg )
+
+sub kiss_index_offset
+{
+ # Martin A. Hansen, November 2009.
+
+ # Given a KISS index and a begin position,
+ # locate the offset closest to the begin position,
+ # and return this.
+
+ my ( $index, # KISS index
+ $beg, # begin position
+ $level, # index level - OPTIONAL
+ $factor, # index factor - OPTIONAL
+ ) = @_;
+
+ # Returns a number.
+
+ my ( $child, $offset );
+
+ $level ||= INDEX_LEVEL;
+ $factor ||= INDEX_FACTOR;
+
+ foreach $child ( @{ $index->{ 'CHILDREN' } } )
+ {
+ next if not defined $child;
+
+ if ( $child->{ 'BEG' } <= $beg and $beg <= $child->{ 'END' } )
{
- push @index, [ $beg, $end - 1, $pos ];
- $beg = $end;
+ if ( $level == $factor ) {
+ $offset = $child->{ 'OFFSET' };
+ } else {
+ $offset = kiss_index_offset( $child, $beg, $level / $factor, $factor );
+ }
}
- elsif( $end < $beg )
+ }
+
+ return $offset;
+}
+
+
+sub kiss_index_count
+{
+ # Martin A. Hansen, November 2009.
+
+ # Given a KISS index and a begin/end interval
+ # sum the number of counts in that interval,
+ # and return this.
+
+ my ( $index, # KISS index
+ $beg, # Begin position
+ $end, # End position
+ $level, # index level - OPTIONAL
+ $factor, # index factor - OPTIONAL
+ ) = @_;
+
+ # Returns a number.
+
+ my ( $count, $child );
+
+ $level ||= INDEX_LEVEL;
+ $factor ||= INDEX_FACTOR;
+ $count ||= 0;
+
+ foreach $child ( @{ $index->{ 'CHILDREN' } } )
+ {
+ next if not defined $child;
+
+ if ( $level >= $factor )
{
- Maasha::Common::error( qq(KISS file not sorted: $end < $beg) );
+ if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) )
+ {
+ $count += $child->{ 'COUNT' } if $level == $factor;
+ $count += kiss_index_count( $child, $beg, $end, $level / $factor, $factor );
+ }
}
-
- $pos += 1 + length $line;
}
- return wantarray ? @index : \@index;
+ return $count;
}
}
-sub kiss_index_search
+sub kiss_index_get_entries
{
- my ( $index,
- $num,
+ my ( $file,
+ $index,
+ $beg,
+ $end,
) = @_;
- # Returns a number.
+ my ( $offset, $fh, $entry, @entries );
- my ( $high, $low, $try );
+ $offset = kiss_index_offset( $index, $beg );
- $low = 0;
- $high = scalar @{ $index };
+ $fh = Maasha::Filesys::file_read_open( $file );
- while ( $low <= $high )
+ sysseek( $fh, $offset, 0 );
+
+ while ( $entry = kiss_entry_get( $fh ) )
{
- $try = int( ( $high + $low ) / 2 );
-
- # print "low: $low high: $high try: $try\n";
-
- if ( $num < $index->[ $try ]->[ 0 ] ) {
- $high = $try;
- } elsif ( $num > $index->[ $try ]->[ 1 ] ) {
- $low = $try + 1;
- } else {
- return $index->[ $try ]->[ 2 ];
- }
+ push @entries, $entry if $entry->{ 'S_END' } > $beg;
+
+ last if $entry->{ 'S_BEG' } > $end;
}
- Maasha::Common::error( "Could not find number->$num in index" );
+ close $fh;
+
+ return wantarray ? @entries : \@entries;
}
-sub kiss_index_get
+sub kiss_index_get_blocks
{
- my ( $file,
+ my ( $index,
$beg,
$end,
+ $level, # index level - OPTIONAL
+ $factor, # index factor - OPTIONAL
+ $size,
) = @_;
- my ( $index, $offset, $fh, $entry, @entries );
+ # Returns a list.
+
+ my ( $len, @blocks, $child );
+
+ $level ||= INDEX_LEVEL;
+ $factor ||= INDEX_FACTOR;
+
+ $size ||= 100; # TODO: lazy list loading?
+
+# if ( not defined $size )
+# {
+# $len = $end - $beg + 1;
+#
+# if ( $len > 100_000_000 ) {
+# $size = 1_000_000;
+# } elsif ( $len > 1_000_000 ) {
+# $size = 10_000;
+# } else {
+# $size = 100;
+# }
+# }
+
+ if ( $level >= $size )
+ {
+ foreach $child ( @{ $index->{ 'CHILDREN' } } )
+ {
+ next if not defined $child;
+
+ if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) )
+ {
+ if ( $level == $size )
+ {
+ push @blocks, {
+ BEG => $child->{ 'BEG' },
+ END => $child->{ 'END' },
+ COUNT => $child->{ 'COUNT' },
+ };
+ }
+
+ push @blocks, kiss_index_get_blocks( $child, $beg, $end, $level / $factor, $factor, $size );
+ }
+ }
+ }
- $index = Maasha::KISS::IO::kiss_index_retrieve( "$file.index" );
+ return wantarray ? @blocks : \@blocks;
+}
- $offset = Maasha::KISS::IO::kiss_index_search( $index, $beg );
- $fh = Maasha::Filesys::file_read_open( $file );
+sub kiss_align
+{
+ # S.aur_COL 41 75 5_vnlh2ywXsN1/1 1 - . 17:A>T,31:A>N . . . .
- sysseek( $fh, $offset, 0 );
+ my ( $s_seqref, # subject sequence reference
+ $entry, # KISS entry
+ ) = @_;
- while ( $entry = kiss_entry_get( $fh ) )
- {
- push @entries, $entry;
+ # Returns a string.
- last if $entry->{ 'S_END' } > $end;
- }
+ my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
- close $fh;
+ $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
+ $q_seq = $s_seq;
- return wantarray ? @entries : \@entries;
+ $insertions = 0;
+
+ foreach $align ( split /,/, $entry->{ 'ALIGN' } )
+ {
+ if ( $align =~ /(\d+):(.)>(.)/ )
+ {
+ $pos = $1;
+ $s_symbol = $2;
+ $q_symbol = $3;
+
+ if ( $s_symbol eq '-' ) # insertion
+ {
+ substr $s_seq, $pos + $insertions, 0, $s_symbol;
+ substr $q_seq, $pos + $insertions, 0, $q_symbol;
+
+ $insertions++;
+ }
+ elsif ( $q_symbol eq '-' ) # deletion
+ {
+ substr $q_seq, $pos + $insertions, 1, $q_symbol;
+ }
+ else # mismatch
+ {
+ substr $q_seq, $pos + $insertions, 1, $q_symbol;
+ }
+ }
+ else
+ {
+ Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
+ }
+ }
+
+ Maasha::Align::align_print_pairwise( [ $entry->{ 'S_ID' }, $s_seq ], [ $entry->{ 'Q_ID' }, $q_seq ] );
}
my ( $record, # Biopiece record
) = @_;
+ $record->{ 'SCORE' } ||= $record->{ 'E_VAL' } || ".";
$record->{ 'HITS' } ||= ".";
$record->{ 'BLOCK_COUNT' } ||= ".";
$record->{ 'BLOCK_BEGS' } ||= ".";
1;
+__END__
+
+sub kiss_index
+{
+ # Martin A, Hansen, October 2009.
+
+ # Creates an index of a sorted KISS file that
+ # allowing the location of the byte position
+ # from where records can be read given a
+ # specific S_BEG position. The index consists of
+ # triples: [ beg, end, bytepos ], where beg and
+ # end denotes the interval where the next KISS
+ # record begins at bytepos.
+
+ my ( $fh, # filehandle to KISS file
+ ) = @_;
+
+ # Returns a list.
+
+ my ( $line, @fields, $beg, $end, $pos, @index );
+
+ $beg = 0;
+ $pos = 0;
+
+ while ( $line = <$fh> )
+ {
+ chomp $line;
+
+ @fields = split /\t/, $line, 3;
+
+ $end = $fields[ S_BEG ];
+
+ if ( $end == 0 )
+ {
+ push @index, [ $beg, $end, $pos ];
+ $beg = 1;
+ }
+ elsif ( $end > $beg )
+ {
+ push @index, [ $beg, $end - 1, $pos ];
+ $beg = $end;
+ }
+ elsif ( $end < $beg )
+ {
+ Maasha::Common::error( qq(KISS file not sorted: beg > end -> $beg > $end) );
+ }
+
+ $pos += 1 + length $line;
+ }
+
+ return wantarray ? @index : \@index;
+}
+
+
+
+sub kiss_index_search
+{
+ my ( $index,
+ $num,
+ ) = @_;
+
+ # Returns a number.
+
+ my ( $high, $low, $try );
+
+ $low = 0;
+ $high = scalar @{ $index };
+
+ while ( $low <= $high )
+ {
+ $try = int( ( $high + $low ) / 2 );
+
+ if ( $num < $index->[ $try ]->[ 0 ] ) {
+ $high = $try;
+ } elsif ( $num > $index->[ $try ]->[ 1 ] ) {
+ $low = $try + 1;
+ } else {
+ return $index->[ $try ]->[ 2 ];
+ }
+ }
+
+ Maasha::Common::error( "Could not find number->$num in index" );
+}
+