major work on KISS browser completed

[biopieces.git] / code_perl / Maasha / KISS / IO.pm
diff --git a/code_perl/Maasha/KISS/IO.pm b/code_perl/Maasha/KISS/IO.pm

index 4c3ac1498dc85292acc8f56ec8e6c95b2544b45b..f013d2fcdd39037d95137527a013986b625e9917 100644 (file)
--- a/code_perl/Maasha/KISS/IO.pm
+++ b/code_perl/Maasha/KISS/IO.pm
@@ -24,6 +24,8 @@ package Maasha::KISS::IO;
  
  # Routines for parsing and emitting KISS records.
  
+# http://code.google.com/p/biopieces/wiki/KissFormat
+
  
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  
@@ -33,48 +35,30 @@ use strict;
  use Data::Dumper;
  use Maasha::Common;
  use Maasha::Filesys;
+use Maasha::Align;
  use Maasha::SQL;
  use vars qw( @ISA @EXPORT );
  
  @ISA = qw( Exporter );
  
  use constant {
-    S_ID        => 0,
-    S_BEG       => 1,
-    S_END       => 2,
-    Q_ID        => 3,
-    SCORE       => 4,
-    STRAND      => 5,
-    HITS        => 6,
-    ALIGN       => 7,
-    BLOCK_COUNT => 8,
-    BLOCK_BEGS  => 9,
-    BLOCK_LENS  => 10,
-    BLOCK_TYPE  => 11,
+    S_ID             => 0,
+    S_BEG            => 1,
+    S_END            => 2,
+    Q_ID             => 3,
+    SCORE            => 4,
+    STRAND           => 5,
+    HITS             => 6,
+    ALIGN            => 7,
+    BLOCK_COUNT      => 8,
+    BLOCK_BEGS       => 9,
+    BLOCK_LENS       => 10,
+    BLOCK_TYPE       => 11,
+    INDEX_BLOCK_SIZE => 100,
+    INDEX_LEVEL      => 100_000_000,
+    INDEX_FACTOR     => 100,
  };
  
-#      0         1         2
-#      012345678901234567890
-#      ---------------------   S.aur complete genome
-#         -===__===-           TAG_000001
-#         0123456789
-#
-#    S_ID        = 'S.aur complete genome'
-#    S_BEG       = 3
-#    S_END       = 12
-#    Q_ID        = 'TAG_000001'
-#    SCORE       => 1
-#    STRAND      => +
-#    HITS        => 31
-#    ALIGN       => 0:A>T,3:G>C
-#    BLOCK_COUNT => 2
-#    BLOCK_BEGS  => 1,6
-#    BLOCK_LENS  => 3,3
-#    BLOCK_TYPE  => 1,1
-#
-#
-# 'S.aur complete genome'   3   12  'TAG_000001'    1   +   31   2   1,6 3,3    1,1
-
  
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  
@@ -96,7 +80,7 @@ sub kiss_entry_get
  
          @fields = split /\t/, $line;
  
-        Maasha::Common::error( qq( BAD kiss entry: $line) ) if not @fields == 12;
+        Maasha::Common::error( qq(BAD kiss entry: $line) ) if not @fields == 12;
          
          $entry{ 'S_ID' }        = $fields[ S_ID ];
          $entry{ 'S_BEG' }       = $fields[ S_BEG ];
@@ -126,22 +110,28 @@ sub kiss_entry_put
      
      my ( @fields );
  
-    $fh ||= \*STDOUT;
-
-    $fields[ S_ID ]        = $entry->{ 'S_ID' };
-    $fields[ S_BEG ]       = $entry->{ 'S_BEG' };
-    $fields[ S_END ]       = $entry->{ 'S_END' };
-    $fields[ Q_ID ]        = $entry->{ 'Q_ID' };
-    $fields[ SCORE ]       = $entry->{ 'SCORE' };
-    $fields[ STRAND ]      = $entry->{ 'STRAND' };
-    $fields[ HITS ]        = $entry->{ 'HITS' };
-    $fields[ ALIGN ]       = $entry->{ 'ALIGN' };
-    $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' };
-    $fields[ BLOCK_BEGS ]  = $entry->{ 'BLOCK_BEGS' };
-    $fields[ BLOCK_LENS ]  = $entry->{ 'BLOCK_LENS' };
-    $fields[ BLOCK_TYPE ]  = $entry->{ 'BLOCK_TYPE' };
-
-    print $fh join( "\t", @fields ), "\n";
+    if ( defined $entry->{ 'S_ID' }  and 
+         defined $entry->{ 'S_BEG' } and
+         defined $entry->{ 'S_END' }
+       )
+    {
+        $fh ||= \*STDOUT;
+    
+        $fields[ S_ID ]        = $entry->{ 'S_ID' };
+        $fields[ S_BEG ]       = $entry->{ 'S_BEG' };
+        $fields[ S_END ]       = $entry->{ 'S_END' };
+        $fields[ Q_ID ]        = $entry->{ 'Q_ID' };
+        $fields[ SCORE ]       = $entry->{ 'SCORE' };
+        $fields[ STRAND ]      = $entry->{ 'STRAND' };
+        $fields[ HITS ]        = $entry->{ 'HITS' };
+        $fields[ ALIGN ]       = $entry->{ 'ALIGN' };
+        $fields[ BLOCK_COUNT ] = $entry->{ 'BLOCK_COUNT' };
+        $fields[ BLOCK_BEGS ]  = $entry->{ 'BLOCK_BEGS' };
+        $fields[ BLOCK_LENS ]  = $entry->{ 'BLOCK_LENS' };
+        $fields[ BLOCK_TYPE ]  = $entry->{ 'BLOCK_TYPE' };
+
+        print $fh join( "\t", @fields ), "\n";
+    }
  }
  
  
@@ -166,53 +156,150 @@ sub kiss_sql_get
  
  sub kiss_index
  {
-    # Martin A, Hansen, October 2009.
+    # Martin A, Hansen, November 2009.
  
-    # Creates an index of a sorted KISS file that
-    # allowing the location of the byte position
-    # from where records can be read given a
-    # specific S_BEG position. The index consists of
-    # triples: [ beg, end, bytepos ], where beg and
-    # end denotes the interval where the next KISS
-    # record begins at bytepos.
+    # Creates an index of a sorted KISS file.
  
-    my ( $fh,   # filehandle to KISS file
+    my ( $file,   # KISS file to index
         ) = @_;
  
-    # Returns a list.
+    # Returns a hashref.
  
-    my ( $line, @fields, $beg, $end, $pos, @index );
+    my ( $tree, $offset, $fh, $line, $beg );
  
-    $beg = 0;
-    $pos = 0;
+    $tree   = {};
+    $offset = 0;
+
+    $fh = Maasha::Filesys::file_read_open( $file );
  
      while ( $line = <$fh> )
      {
-        chomp $line;
+        ( undef, $beg ) = split "\t", $line, 3;
  
-        @fields = split /\t/, $line, 3;
+        kiss_index_node_add( $tree, INDEX_LEVEL, INDEX_FACTOR, $beg, $offset );
+                
+        $offset += length $line;
+    }
+
+    close $fh;
+
+    kiss_index_store( "$file.index", $tree );
+}
+
+
+sub kiss_index_node_add
+{
+    # Martin A, Hansen, November 2009.
+
+    # Recursive routine to add nodes to a tree.
+
+    my ( $node,
+         $level,
+         $factor,
+         $beg,
+         $offset,
+         $sum,
+       ) = @_;
+       
+    my ( $bucket );
+    
+    $sum  ||= 0;
+    $bucket = int( $beg / $level );
+    
+    if ( $level >= $factor )
+    {
+        $sum += $bucket * $level;
+        $beg -= $bucket * $level;
          
-        $end = $fields[ S_BEG ];
+        $node->{ 'CHILDREN' }->[ $bucket ]->{ 'COUNT' }++; 
+        # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'LEVEL' }  = $level;
+        # $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BUCKET' } = $bucket;
+        $node->{ 'CHILDREN' }->[ $bucket ]->{ 'BEG' }    = $sum;
+        $node->{ 'CHILDREN' }->[ $bucket ]->{ 'END' }    = $sum + $level - 1;
+        $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' } = $offset if not defined $node->{ 'CHILDREN' }->[ $bucket ]->{ 'OFFSET' };
+        
+        kiss_index_node_add( $node->{ 'CHILDREN' }->[ $bucket ], $level / $factor, $factor, $beg, $offset, $sum );
+    }   
+}
  
-        if ( $end == 0 )
-        {
-            push @index, [ $beg, $end, $pos ];
-            $beg = 1;
-        }
-        elsif ( $end > $beg )
+
+sub kiss_index_offset
+{
+    # Martin A. Hansen, November 2009.
+
+    # Given a KISS index and a begin position,
+    # locate the offset closest to the begin position,
+    # and return this.
+    
+    my ( $index,    # KISS index
+         $beg,      # begin position
+         $level,    # index level  - OPTIONAL
+         $factor,   # index factor - OPTIONAL
+       ) = @_;
+
+    # Returns a number.
+
+    my ( $child, $offset );
+
+    $level  ||= INDEX_LEVEL;
+    $factor ||= INDEX_FACTOR;
+
+    foreach $child ( @{ $index->{ 'CHILDREN' } } )
+    {
+        next if not defined $child;
+
+        if ( $child->{ 'BEG' } <= $beg and $beg <= $child->{ 'END' } )
          {
-            push @index, [ $beg, $end - 1, $pos ];
-            $beg = $end;
+            if ( $level == $factor ) {
+                $offset = $child->{ 'OFFSET' };
+            } else {
+                $offset = kiss_index_offset( $child, $beg, $level / $factor, $factor );
+            }
          }
-        elsif( $end < $beg )
+    }
+
+    return $offset;
+}
+
+
+sub kiss_index_count
+{
+    # Martin A. Hansen, November 2009.
+    
+    # Given a KISS index and a begin/end interval
+    # sum the number of counts in that interval,
+    # and return this.
+
+    my ( $index,   # KISS index
+         $beg,     # Begin position
+         $end,     # End position
+         $level,   # index level  - OPTIONAL
+         $factor,  # index factor - OPTIONAL
+       ) = @_;
+
+    # Returns a number.
+    
+    my ( $count, $child );
+
+    $level  ||= INDEX_LEVEL;
+    $factor ||= INDEX_FACTOR;
+    $count  ||= 0;
+
+    foreach $child ( @{ $index->{ 'CHILDREN' } } )
+    {
+        next if not defined $child;
+
+        if ( $level >= $factor )
          {
-            Maasha::Common::error( qq(KISS file not sorted: $end < $beg) );
+            if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) )
+            {
+                $count += $child->{ 'COUNT' } if $level == $factor;
+                $count += kiss_index_count( $child, $beg, $end, $level / $factor, $factor );
+            }
          }
-
-        $pos += 1 + length $line;
      }
  
-    return wantarray ? @index : \@index;
+    return $count;
  }
  
  
@@ -239,65 +326,141 @@ sub kiss_index_retrieve
  }
  
  
-sub kiss_index_search
+sub kiss_index_get_entries
  {
-    my ( $index,
-         $num,
+    my ( $file,
+         $index,
+         $beg,
+         $end,
         ) = @_;
  
-    # Returns a number.
+    my ( $offset, $fh, $entry, @entries );
  
-    my ( $high, $low, $try );
+    $offset = kiss_index_offset( $index, $beg );
  
-    $low  = 0;
-    $high = scalar @{ $index };
+    $fh = Maasha::Filesys::file_read_open( $file );
  
-    while ( $low <= $high )
+    sysseek( $fh, $offset, 0 );
+
+    while ( $entry = kiss_entry_get( $fh ) )
      {
-        $try = int( ( $high + $low ) / 2 );
-    
-        # print "low: $low   high: $high   try: $try\n";
-        
-        if ( $num < $index->[ $try ]->[ 0 ] ) {
-            $high = $try;
-        } elsif ( $num > $index->[ $try ]->[ 1 ] ) {
-            $low = $try + 1;
-        } else {
-            return $index->[ $try ]->[ 2 ];
-        }
+        push @entries, $entry if $entry->{ 'S_END' } > $beg;
+
+        last if $entry->{ 'S_BEG' } > $end;
      }
  
-    Maasha::Common::error( "Could not find number->$num in index" );
+    close $fh;
+
+    return wantarray ? @entries : \@entries;
  }
  
  
-sub kiss_index_get
+sub kiss_index_get_blocks
  {
-    my ( $file,
+    my ( $index,
           $beg,
           $end,
+         $level,   # index level  - OPTIONAL
+         $factor,  # index factor - OPTIONAL
+         $size,
         ) = @_;
  
-    my ( $index, $offset, $fh, $entry, @entries );
+    # Returns a list.
+    
+    my ( $len, @blocks, $child );
+
+    $level  ||= INDEX_LEVEL;
+    $factor ||= INDEX_FACTOR;
+
+    $size ||= 100;   # TODO: lazy list loading?
+
+#    if ( not defined $size )
+#    {
+#        $len = $end - $beg + 1;
+#    
+#        if ( $len > 100_000_000 ) {
+#            $size = 1_000_000;
+#        } elsif ( $len > 1_000_000 ) {
+#            $size = 10_000;
+#        } else {
+#            $size = 100;
+#        }
+#    }
+
+    if ( $level >= $size )
+    {
+        foreach $child ( @{ $index->{ 'CHILDREN' } } )
+        {
+            next if not defined $child;
+
+            if ( Maasha::Calc::overlap( $beg, $end, $child->{ 'BEG' }, $child->{ 'END' } ) )
+            {
+                if ( $level == $size )
+                {
+                    push @blocks, {
+                        BEG   => $child->{ 'BEG' },
+                        END   => $child->{ 'END' },
+                        COUNT => $child->{ 'COUNT' },
+                    };
+                }
+
+                push @blocks, kiss_index_get_blocks( $child, $beg, $end, $level / $factor, $factor, $size );
+            }
+        }
+    }
  
-    $index = Maasha::KISS::IO::kiss_index_retrieve( "$file.index" );
+    return wantarray ? @blocks : \@blocks;
+}
  
-    $offset = Maasha::KISS::IO::kiss_index_search( $index, $beg );
  
-    $fh = Maasha::Filesys::file_read_open( $file );
+sub kiss_align
+{
+    # S.aur_COL       41      75      5_vnlh2ywXsN1/1 1       -       .       17:A>T,31:A>N   .       .       .       .
  
-    sysseek( $fh, $offset, 0 );
+    my ( $s_seqref,   # subject sequence reference
+         $entry,      # KISS entry
+       ) = @_;
  
-    while ( $entry = kiss_entry_get( $fh ) )
-    {
-        push @entries, $entry;
+    # Returns a string.
  
-        last if $entry->{ 'S_END' } > $end;
-    }
+    my ( $align, $pos, $s_symbol, $q_symbol, $s_seq, $q_seq, $insertions );
  
-    close $fh;
+    $s_seq = substr ${ $s_seqref }, $entry->{ 'S_BEG' }, $entry->{ 'S_END' } - $entry->{ 'S_BEG' } + 1;
+    $q_seq = $s_seq;
  
-    return wantarray ? @entries : \@entries;
+    $insertions = 0;
+
+    foreach $align ( split /,/, $entry->{ 'ALIGN' } )
+    {
+        if ( $align =~ /(\d+):(.)>(.)/ )
+        {
+            $pos      = $1;
+            $s_symbol = $2;
+            $q_symbol = $3;
+
+            if ( $s_symbol eq '-' ) # insertion
+            {
+                substr $s_seq, $pos + $insertions, 0, $s_symbol;
+                substr $q_seq, $pos + $insertions, 0, $q_symbol;
+
+                $insertions++;
+            }
+            elsif ( $q_symbol eq '-' ) # deletion
+            {
+                substr $q_seq, $pos + $insertions, 1, $q_symbol;
+            }
+            else # mismatch
+            {
+                substr $q_seq, $pos + $insertions, 1, $q_symbol;
+            }
+        }
+        else
+        {
+            Maasha::Common::error( qq(Bad alignment descriptor: "$align") );
+        }
+    }
+
+    Maasha::Align::align_print_pairwise( [ $entry->{ 'S_ID' }, $s_seq ], [ $entry->{ 'Q_ID' }, $q_seq ] );
  }
  
  
@@ -315,6 +478,7 @@ sub biopiece2kiss
      my ( $record,   # Biopiece record
         ) = @_;
  
+    $record->{ 'SCORE' }       ||= $record->{ 'E_VAL' } || ".";
      $record->{ 'HITS' }        ||= ".";
      $record->{ 'BLOCK_COUNT' } ||= ".";
      $record->{ 'BLOCK_BEGS' }  ||= ".";
@@ -330,4 +494,88 @@ sub biopiece2kiss
  
  1;
  
+__END__
+
+sub kiss_index
+{
+    # Martin A, Hansen, October 2009.
+
+    # Creates an index of a sorted KISS file that
+    # allowing the location of the byte position
+    # from where records can be read given a
+    # specific S_BEG position. The index consists of
+    # triples: [ beg, end, bytepos ], where beg and
+    # end denotes the interval where the next KISS
+    # record begins at bytepos.
+
+    my ( $fh,   # filehandle to KISS file
+       ) = @_;
+
+    # Returns a list.
+
+    my ( $line, @fields, $beg, $end, $pos, @index );
+
+    $beg = 0;
+    $pos = 0;
+
+    while ( $line = <$fh> )
+    {
+        chomp $line;
+
+        @fields = split /\t/, $line, 3;
+        
+        $end = $fields[ S_BEG ];
+
+        if ( $end == 0 )
+        {
+            push @index, [ $beg, $end, $pos ];
+            $beg = 1;
+        }
+        elsif ( $end > $beg )
+        {
+            push @index, [ $beg, $end - 1, $pos ];
+            $beg = $end;
+        }
+        elsif ( $end < $beg )
+        {
+            Maasha::Common::error( qq(KISS file not sorted: beg > end -> $beg > $end) );
+        }
+
+        $pos += 1 + length $line;
+    }
+
+    return wantarray ? @index : \@index;
+}
+
+
+
+sub kiss_index_search
+{
+    my ( $index,
+         $num,
+       ) = @_;
+
+    # Returns a number.
+
+    my ( $high, $low, $try );
+
+    $low  = 0;
+    $high = scalar @{ $index };
+
+    while ( $low <= $high )
+    {
+        $try = int( ( $high + $low ) / 2 );
+    
+        if ( $num < $index->[ $try ]->[ 0 ] ) {
+            $high = $try;
+        } elsif ( $num > $index->[ $try ]->[ 1 ] ) {
+            $low = $try + 1;
+        } else {
+            return $index->[ $try ]->[ 2 ];
+        }
+    }
+
+    Maasha::Common::error( "Could not find number->$num in index" );
+}
+