working on AlignTwoSeq

author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>

Thu, 27 Aug 2009 15:05:05 +0000 (15:05 +0000)

committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>

Thu, 27 Aug 2009 15:05:05 +0000 (15:05 +0000)
author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 27 Aug 2009 15:05:05 +0000 (15:05 +0000)
committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 27 Aug 2009 15:05:05 +0000 (15:05 +0000)
diff --git a/bp_bin/align_pair_seq b/bp_bin/align_pair_seq

index 1a9d673121ac4a66d3bbc555f3fcc0355b62d2c7..24b8c659bc2523a5eee247d677d89b07bced845b 100755 (executable)
--- a/bp_bin/align_pair_seq
+++ b/bp_bin/align_pair_seq
@@ -30,6 +30,7 @@ use warnings;
  use strict;
  use Maasha::Biopieces;
  use Maasha::AlignTwoSeq;
+use Data::Dumper;
  
  
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
@@ -57,8 +58,10 @@ while ( $record = Maasha::Biopieces::get_record( $in ) )
              $records[ 0 ]->{ 'SEQ' } = lc $records[ 0 ]->{ 'SEQ' };
              $records[ 1 ]->{ 'SEQ' } = lc $records[ 1 ]->{ 'SEQ' };
  
-            my $matches = Maasha::AlignTwoSeq::align_two_seq( \$records[ 0 ]->{ 'SEQ' } ,\$records[ 1 ]->{ 'SEQ' } );
+            my $matches = Maasha::AlignTwoSeq::align_two_seq( { 'Q_SEQ' => \$records[ 0 ]->{ 'SEQ' } , 'S_SEQ' => \$records[ 1 ]->{ 'SEQ' } }, [] );
              
+            print Dumper( $matches );
+
              Maasha::AlignTwoSeq::insert_indels( $matches, \$records[ 0 ]->{ 'SEQ' } ,\$records[ 1 ]->{ 'SEQ' } );
  
              map { Maasha::Biopieces::put_record( $_, $out ) } @records;
diff --git a/code_c/Maasha/src/Makefile b/code_c/Maasha/src/Makefile

index 3521b3a9958acd714248da8988d6589e190df830..bbd83c2f9c44b758144f11c9588d6f1fa9248d03 100644 (file)
--- a/code_c/Maasha/src/Makefile
+++ b/code_c/Maasha/src/Makefile
@@ -12,7 +12,7 @@ INC = -I $(INC_DIR)
  LIB = -lm $(LIB_DIR)*.o
  
  # all: libs utest bed2fixedstep bed2tag_contigs bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic
-all: libs bed2fixedstep bed2tag_contigs bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic
+all: libs align_two_seq bed2fixedstep bed2tag_contigs bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic
  
  libs:
         cd $(LIB_DIR) && ${MAKE} all
@@ -20,6 +20,9 @@ libs:
  utest:
         cd $(TEST_DIR) && ${MAKE} all
  
+align_two_seq: align_two_seq.c
+       $(CC) $(CFLAGS) $(INC) $(LIB) align_two_seq.c -o align_two_seq
+
  bed2fixedstep: bed2fixedstep.c
         $(CC) $(CFLAGS) $(INC) $(LIB) bed2fixedstep.c -o bed2fixedstep
  
diff --git a/code_perl/Maasha/AlignTwoSeq.pm b/code_perl/Maasha/AlignTwoSeq.pm

index 94f438a2daee0b6a68d7bec8512683c98a23cc4b..506f3a25cb689114305ede4295c57dfd42f507e0 100644 (file)
--- a/code_perl/Maasha/AlignTwoSeq.pm
+++ b/code_perl/Maasha/AlignTwoSeq.pm
@@ -22,7 +22,7 @@ package Maasha::AlignTwoSeq;
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  
  
-# yak yak yak
+# Routines to create a pair-wise alignment.
  
  
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
@@ -31,23 +31,10 @@ package Maasha::AlignTwoSeq;
  use warnings;
  use strict;
  use Data::Dumper;
-use Storable qw( dclone );
  use Maasha::Calc;
  use Maasha::Seq;
  use vars qw ( @ISA @EXPORT );
  
-use constant {
-    Q_BEG    => 0,
-    Q_END    => 1,
-    S_BEG    => 2,
-    S_END    => 3,
-    LEN      => 4,
-    SCORE    => 5,
-    HEAD     => 0,
-    SEQ      => 1,
-    ALPH_LEN => 4,
-};
-
  @ISA = qw( Exporter );
  
  
@@ -66,64 +53,43 @@ sub align_two_seq
      # on a calculated score. New search spaces spanning the spaces between
      # matches and the search space boundaries will be cast and recursed into.
      
-    my ( $q_seq,     # sequence 1 ref
-         $s_seq,     # sequence 2 ref
+    # Usage: align_two_seq( { Q_SEQ => \$q_seq, S_SEQ => \$s_seq }, [] );
+
+    my ( $space,     # search space
           $matches,   # list of matches
-         $q_min,     # q sequence start position
-         $q_max,     # q sequecne stop  position
-         $s_min,     # s sequence start position
-         $s_max,     # s sequecne stop  position
         ) = @_;
  
-    # returns a chain of matches that can be chained into an alignment
+    # Returns a list.
  
-    $matches ||= [];
-    $q_min   ||= 0;
-    $s_min   ||= 0;
-    $q_max   ||= length( ${ $q_seq } ) - 1;
-    $s_max   ||= length( ${ $s_seq } ) - 1;
+    my ( @chain, $best_match, $left_space, $right_space );
  
-    my ( $wordsize, @chain, $match, $best_match, @long_matches );
+    new_space( $space );
  
-    $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
+    matches_select( $matches, $space );
  
      if ( scalar @{ $matches } == 0 )   # no matches - find some!
      {
-        $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max );
-        $matches  = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
+        $matches = matches_find( $space );
  
-        while ( scalar @{ $matches } == 0 and $wordsize > 1 )
-        {
-            $wordsize--;
-            $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
-        }
- 
          if ( scalar @{ $matches } > 0 ) {
-            push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max );
+            push @chain, align_two_seq( $space, $matches );
          }
      }
      else   # matches are included according to score
      {
-        foreach $match ( @{ $matches } ) {
-            $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
-        }
-
-        @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
-        @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
+        $matches = matches_filter( $space, $matches );
  
-        $best_match   = shift @{ $matches };
+        $best_match = shift @{ $matches };
  
          if ( $best_match )
          {
              push @chain, $best_match;
  
-            if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
-                push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1 );   # left search space
-            }
+            $left_space  = new_space_left( $best_match, $space );
+            $right_space = new_space_right( $best_match, $space );
  
-            if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
-                push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max );   # right search space
-            }
+            push @chain, align_two_seq( $left_space, $matches )  if defined $left_space;
+            push @chain, align_two_seq( $right_space, $matches ) if defined $right_space;
          }
      }
  
@@ -131,7 +97,92 @@ sub align_two_seq
  }
  
  
-sub select_matches
+sub new_space
+{
+    # Martin A. Hansen, August 2009.
+
+    # Define a new search space if not previously
+    # defined. This occurs if align_two_seq() is
+    # called initially with only the sequences.
+
+    my ( $space,   # search space
+       ) = @_;
+
+    # Returns nothing.
+
+    if ( not defined $space->{ 'Q_MAX' } )
+    {
+        $space->{ 'Q_MIN' } = 0;
+        $space->{ 'S_MIN' } = 0;
+        $space->{ 'Q_MAX' } = length( ${ $space->{ 'Q_SEQ' } } ) - 1,
+        $space->{ 'S_MAX' } = length( ${ $space->{ 'S_SEQ' } } ) - 1,
+    }
+}
+
+
+sub new_space_left
+{
+    # Martin A. Hansen, August 2009.
+
+    # Create a new search space between the beginning of
+    # the current search space and the best match.
+
+    my ( $best_match,   # best match
+         $space,        # search space
+       ) = @_;
+
+    # Returns hashref.
+
+    my ( $new_space );
+
+    if ( $best_match->{ 'Q_BEG' } - $space->{ 'Q_MIN' } >= 2 and $best_match->{ 'S_BEG' } - $space->{ 'S_MIN' } >= 2 )
+    {
+        $new_space = {
+            Q_SEQ => $space->{ 'Q_SEQ' },
+            S_SEQ => $space->{ 'S_SEQ' },
+            Q_MIN => $space->{ 'Q_MIN' },
+            Q_MAX => $best_match->{ 'Q_BEG' } - 1,
+            S_MIN => $space->{ 'S_MIN' },
+            S_MAX => $best_match->{ 'S_BEG' } - 1,
+        };
+    }
+
+    return wantarray ? %{ $new_space } : $new_space;
+}
+
+
+sub new_space_right
+{
+    # Martin A. Hansen, August 2009.
+
+    # Create a new search space between the best match
+    # and the end of the current search space.
+
+    my ( $best_match,   # best match
+         $space,        # search space
+       ) = @_;
+
+    # Returns hashref.
+
+    my ( $new_space );
+
+    if ( $space->{ 'Q_MAX' } - $best_match->{ 'Q_END' } >= 2 and $space->{ 'S_MAX' } - $best_match->{ 'S_END' } >= 2 )
+    {
+        $new_space = {
+            Q_SEQ => $space->{ 'Q_SEQ' },
+            S_SEQ => $space->{ 'S_SEQ' },
+            Q_MIN => $best_match->{ 'Q_END' } + 1,
+            Q_MAX => $space->{ 'Q_MAX' },
+            S_MIN => $best_match->{ 'S_END' } + 1,
+            S_MAX => $space->{ 'S_MAX' },
+        };
+    }
+
+    return wantarray ? %{ $new_space } : $new_space;
+}
+
+
+sub matches_select
  {
      # Martin A. Hansen, August 2006.
  
@@ -140,168 +191,113 @@ sub select_matches
      # this search space.
  
      my ( $matches,   # list of matches
-         $q_min,     # q sequence start position
-         $q_max,     # q sequecne stop  position
-         $s_min,     # s sequence start position
-         $s_max,     # s sequecne stop  position
+         $space,     # search space
         ) = @_;
  
-    # returns list of matches
-
-    my ( @matches );
-
-    @matches = grep { $_->[ Q_BEG ] >= $q_min and
-                      $_->[ S_BEG ] >= $s_min and
-                      $_->[ Q_END ] <= $q_max and
-                      $_->[ S_END ] <= $s_max } @{ $matches };
+    # Returns nothing.
  
-    return wantarray ? @matches : \@matches;
+    @{ $matches } = grep { $_->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
+                           $_->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
+                           $_->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
+                           $_->{ 'S_END' } <= $space->{ 'S_MAX' } } @{ $matches };
  }
  
  
-sub find_wordsize
+sub word_size_calc
  {
-    # Martin A. Hansen, August 2006.
-
+    # Martin A. Hansen, August 2008.
+  
      # Given a search space calculates the wordsize for a word so a match
      # occurs only a few times. More matches may be needed at low similarity in
      # order to avoid starting with a wrong match.
  
-    my ( $q_min,   # q sequence start position
-         $q_max,   # q sequecne stop position
-         $s_min,   # s sequence start position
-         $s_max,   # s sequecne stop position
+    my ( $space,   # search space
         ) = @_;
  
-    # returns integer
-
-    my ( $q_dim, $s_dim, $dim_min, $wordsize );
+    # Returns integer.
  
-    $q_dim = $q_max - $q_min + 1;
-    $s_dim = $s_max - $s_min + 1;
+    my ( $factor, $word_size );
  
-    $dim_min = Maasha::Calc::min( $q_dim, $s_dim );
+    $factor = 0.05;
  
-    $wordsize = 1;
-
-    if ( $dim_min > 2000000 )               # optimized for DNA
-    {
-        $wordsize = 50;
-    }
-    elsif ( $dim_min > 100000 )             # optimized for DNA
-    {
-        $wordsize = 25;
-    }
-    elsif ( $q_dim > 100 or $s_dim > 100 )  # optimized for DNA
-    {
-        while ( ALPH_LEN ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
-            $wordsize++;
-        }
-    }
-    else
-    {
-        while ( ALPH_LEN ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
-            $wordsize++;
-        }
+    if ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } < $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) {
+        $word_size = int( $factor * ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } ) ) + 1;
+    } else {
+        $word_size = int( $factor * ( $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) ) + 1;
      }
  
-    return $wordsize;
+    return $word_size;
  }
  
  
-sub find_matches
+sub seq_index
  {
-    # Martin A. Hansen, November 2006
+    # Martin A. Hansen, August 2009.
  
-    # given two sequences, find all maximum expanded matches between these
+    # Indexes a query sequence in a specified search space
+    # Overlapping words are saved as keys in an index hash,
+    # and a list of word postions are saved as value.
  
-    my ( $q_seq,      # sequence 1
-         $s_seq,      # sequence 2
-         $wordsize,   # word size
-         $q_min,      # q sequence start position
-         $q_max,      # q sequecne stop  position
-         $s_min,      # s sequence start position
-         $s_max,      # s sequecne stop  position
+    my ( $space,       # search space
+         $word_size,   # word size
         ) = @_;
  
-    # returns list of matches
-
-    my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches );
+    # Returns a hashref.
+    
+    my ( $i, $word, %index );
  
-    if ( length ${ $s_seq } > length ${ $q_seq } )
+    for ( $i = $space->{ 'Q_MIN' }; $i <= $space->{ 'Q_MAX' } - $word_size + 1; $i++ )
      {
-        for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
-        {
-            $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
+        $word = uc substr ${ $space->{ 'Q_SEQ' } }, $i, $word_size;
  
-            next if $q_word =~ /n/i; #   DNA/genome optimization
+        next if $word =~ tr/N//;   # Skip sequences with Ns.
  
-            push @{ $word_hash{ $q_word } }, $q_beg;
-        }
+        push @{ $index{ $word } }, $i;
+    }
  
-        for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
-        {
-            $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
+    return wantarray ? %index : \%index;
+}
  
-            if ( exists $word_hash{ $s_word } )
-            {
-                foreach $q_beg ( @{ $word_hash{ $s_word } } )
-                {
-                    $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
-
-                    if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
-                                $match->[ Q_END ] <= $_->[ Q_END ] and
-                                $match->[ S_BEG ] >= $_->[ S_BEG ] and
-                                $match->[ S_END ] <= $_->[ S_END ] } @matches )
-                    {
-                        next;   # match is redundant
-                    }
-                    else
-                    {
-                        $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
-                        $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
-
-                        push @matches, $match;
-                    }
-                }
-            }
-        }
-    }
-    else
-    {
-        for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
-        {
-            $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
  
-            next if $s_word =~ /n/i; #   DNA/genome optimization
+sub seq_scan
+{
+    # Martin A. Hansen, August 2009.
  
-            push @{ $word_hash{ $s_word } }, $s_beg;
-        }
+    my ( $index,       # sequence index
+         $space,       # search space
+         $word_size,   # word size
+       ) = @_;
  
-        for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
-        {
-            $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
+    # Returns a list.
+
+    my ( $i, $word, $pos, $match, @matches, $redundant );
  
-            if ( exists $word_hash{ $q_word } )
+    $redundant = {};
+
+    for ( $i = $space->{ 'S_MIN' }; $i <= $space->{ 'S_MAX' } - $word_size + 1; $i++ )
+    {
+        $word = uc substr ${ $space->{ 'S_SEQ' } }, $i, $word_size;
+
+        if ( exists $index->{ $word } )
+        {
+            foreach $pos ( @{ $index->{ $word } } )
              {
-                foreach $s_beg ( @{ $word_hash{ $q_word } } )
+                $match = {
+                    'Q_BEG' => $pos,
+                    'Q_END' => $pos + $word_size - 1,
+                    'S_BEG' => $i,
+                    'S_END' => $i + $word_size - 1,
+                    'LEN'   => $word_size,
+                    'SCORE' => 0,
+                };
+
+                if ( not match_redundant( $match, $redundant ) )
                  {
-                    $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
-
-                    if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
-                                $match->[ Q_END ] <= $_->[ Q_END ] and
-                                $match->[ S_BEG ] >= $_->[ S_BEG ] and
-                                $match->[ S_END ] <= $_->[ S_END ] } @matches )
-                    {
-                        next;   # match is redundant
-                    }
-                    else
-                    {
-                        $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
-                        $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
-
-                        push @matches, $match;
-                    }
+                    match_expand( $match, $space );
+
+                    push @matches, $match;
+
+                    match_redundant_add( $match, $redundant );
                  }
              }
          }
@@ -311,271 +307,328 @@ sub find_matches
  }
  
  
-sub expand_match
+sub matches_find
  {
-    # Martin A. Hansen, August 2006.
+    # Martin A. Hansen, November 2006
  
-    # Given two sequences and a match, expand the match maximally.
-    # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ]
+    # Given two sequences, find all maximum expanded matches between these.
  
-    my ( $q_seq,   # sequence 1 ref
-         $s_seq,   # sequence 2 ref
-         $match,   # sequence match
-         $q_max,   # q sequecne stop position
-         $q_min,   # q sequence start position
-         $s_max,   # s sequecne stop position
-         $s_min,   # s sequence start position
+    my ( $space,   # search space
         ) = @_;
  
-    # returns match
+    # returns list of matches
  
-    my ( $q_pos, $s_pos );
+    my ( $word_size, $index, $matches );
  
-    # expanding forward
+    $word_size = word_size_calc( $space );
  
-    $q_pos = $match->[ Q_END ] + 1;
-    $s_pos = $match->[ S_END ] + 1;
+    $matches = [];
  
-    while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
+    while ( scalar @{ $matches } == 0 and $word_size > 0 )
      {
-        $q_pos++;
-        $s_pos++;
+        $index   = seq_index( $space, $word_size );
+        $matches = seq_scan( $index, $space, $word_size );
+
+        $word_size--;
      }
  
-    $match->[ Q_END ] = $q_pos - 1;
-    $match->[ S_END ] = $s_pos - 1;
+    return wantarray ? @{ $matches } : $matches;
+}
  
-    # expanding backwards
  
-    $q_pos = $match->[ Q_BEG ] - 1;
-    $s_pos = $match->[ S_BEG ] - 1;
+sub match_expand
+{
+    # Martin A. Hansen, August 2009.
  
-    while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
-    {
-        $q_pos--;
-        $s_pos--;
-    }
+    # Given a match and a search space hash,
+    # expand the match maximally both forward and
+    # backward direction.
+
+    my ( $match,   # match to expand
+         $space,   # search space
+       ) = @_;
  
-    $match->[ Q_BEG ] = $q_pos + 1;
-    $match->[ S_BEG ] = $s_pos + 1;
+    # Returns nothing.
  
-    return $match;
+    match_expand_forward( $match, $space );
+    match_expand_backward( $match, $space );
  }
  
  
-sub score_match
+sub match_expand_forward
  {
-    # Martin A. Hansen, August 2006
-    
-    # given a match and a search space scores the match according to three criteria:
-    
-    # 1) length of match - the longer the better.
-    # 2) distance to closest corner - the shorter the better.
-    # 3) distance to closest narrow end of the search space - the shorter the better.
-    
-    # each of these scores are divided by search space dimentions, and the total score
-    # is calculated: total = score_len - score_corner - score_narrow
-    
-    # the higher the score, the better the match.
-    
-    my ( $match,   # match
-         $q_min,   # q sequence start position
-         $q_max,   # q sequecne stop  position
-         $s_min,   # s sequence start position
-         $s_max,   # s sequecne stop  position
-       ) = @_;
-    
-    # returns a positive number
-    
-    my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist,
-         $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist );
+    # Martin A. Hansen, August 2009.
  
-    # ----- 1) scoring according to match length
+    # Given a match and a search space hash,
+    # expand the match maximally in the forward
+    # direction.
  
-    $score_len = $match->[ LEN ] ** 3;
-
-    # ---- 2) score according to distance away from diagonal
+    my ( $match,   # match to expand
+         $space,   # search space
+       ) = @_;
  
-    $q_dim = $q_max - $q_min + 1;
-    $s_dim = $s_max - $s_min + 1;
+    # Returns nothing.
  
-    if ( $q_dim >= $s_dim ) # s_dim is the narrow end
+    while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
+            $match->{ 'S_END' } <= $space->{ 'S_MAX' } and 
+            lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
      {
-        $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim );
-        $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max );
+        $match->{ 'Q_END' }++;
+        $match->{ 'S_END' }++;
+        $match->{ 'LEN' }++;
      }
-    else
+
+    $match->{ 'Q_END' }--;
+    $match->{ 'S_END' }--;
+    $match->{ 'LEN' }--;
+}
+
+
+sub match_expand_backward
+{
+    # Martin A. Hansen, August 2009.
+
+    # Given a match and a search space hash,
+    # expand the match maximally in the backward
+    # direction.
+
+    my ( $match,   # match to expand
+         $space,   # search space
+       ) = @_;
+
+    # Returns nothing.
+
+    while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
+            $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and 
+            lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
      {
-        $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim );
-        $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max );
+        $match->{ 'Q_BEG'}--;
+        $match->{ 'S_BEG'}--;
+        $match->{ 'LEN' }++;
      }
  
-    $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
+    $match->{ 'Q_BEG'}++;
+    $match->{ 'S_BEG'}++;
+    $match->{ 'LEN' }--;
+}
  
-    $score_diag = 2 * $min_diag_dist ** 2;
  
-    # ----- 3) scoring according to distance to the narrow end of the search space
+sub match_redundant
+{
+    # Martin A. Hansen, August 2009
+    
+    my ( $new_match,       # match
+         $redundant,   # hashref
+       ) = @_;
  
-    if ( $q_dim > $s_dim ) # s_dim is the narrow end
-    {
-        $beg_narrow_dist = $match->[ Q_BEG ] - $q_min;
-        $end_narrow_dist = $q_max - $match->[ Q_BEG ];
+    # Returns boolean.
  
-        $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
-    }
-    elsif ( $q_dim < $s_dim )
-    {
-        $beg_narrow_dist = $match->[ S_BEG ] - $s_min;
-        $end_narrow_dist = $s_max - $match->[ S_BEG ];
+    my ( $match );
  
-        $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
-    }
-    else
+    foreach $match ( @{ $redundant->{ $new_match->{ 'Q_BEG' } } } )
      {
-        $max_narrow_dist = 0;
+        if ( $new_match->{ 'S_BEG' } >= $match->{ 'S_BEG' } and $new_match->{ 'S_END' } <= $match->{ 'S_END' } ) {
+            return 1;
+        }
      }
  
-    $score_narrow = $max_narrow_dist;
+    return 0;
+}
+
+
+sub match_redundant_add
+{
+    # Martin A. Hansen, August 2009
+
+    my ( $match,       # match
+         $redundant,   # hash ref
+       ) = @_;
  
-    $score_total = $score_len - $score_narrow - $score_diag;
+    # Returns nothing.
  
-    return $score_total;
+    map { push @{ $redundant->{ $_ } }, $match } ( $match->{ 'Q_BEG' } .. $match->{ 'Q_END' } );
  }
  
  
-sub score_match_niels
+sub matches_filter
  {
-    # Niels Larsen, June 2004.
-    
-    # Creates a crude "heuristic" attempt of telling how likely it is that a 
-    # given match occurs by chance in a given search space. If sequences are
-    # given their composition is taken into account. The scoring punishes 
-    # distance from diagonal(s) and distance from previous match(es). Scores
-    # range from zero and up, and lower is better. 
-
-    my ( $match,   # Match array
-         $q_seq,   # Either q_seq or s_seq
-         $q_min,   # Lower bound search area (query sequence)
-         $q_max,   # Upper bound search area (query sequence)
-         $s_min,   # Lower bound search area (subject sequence)
-         $s_max,   # Upper bound search area (subject sequence)
-         ) = @_;
-    
-    # Returns a positive number. 
-    
-    my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos,
-         $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i,
-         $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch,
-         $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta,
-         $delta_max );
-    
-    $q_beg = $match->[Q_BEG];
-    $q_end = $match->[Q_END];
-    $s_beg = $match->[S_BEG];
-    $s_end = $match->[S_END];
-    
-    # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<< 
+    # Martin A. Hansen, August 2009.
  
-    # Get the probability of a match from the sequence composition (when
-    # match is longer than 20 and sequence is given, otherwise use 0.25)
-    # and raise that to the power of the length. 
+    my ( $space,     # search space
+         $matches,   # list of matches
+       ) = @_;
  
-    if ( $match->[LEN] > 20 and defined $q_seq )
-    {
-        $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1;
-        $seqlen = length $seq;
+    # Returns a list.
  
-        $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0;
+    my ( $best_match, $match, @new_matches );
  
-        for ( $i = 0; $i < $seqlen; $i++ )
+    $best_match = shift @{ $matches };
+
+    match_score( $best_match, $space );
+
+    foreach $match ( @{ $matches } )
+    {
+        match_score( $match, $space );
+
+        if ( $match->{ 'SCORE' } > 0 )
          {
-            $chars{ substr $seq, $i, 1 }++;
+            if ( $match->{ 'SCORE' } > $best_match->{ 'SCORE' } ) {
+                $best_match = $match;
+            } else {
+                push @new_matches, $match;
+            }
          }
-
-        $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2
-                + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2;
-    }
-    else {
-        $pmatch = 0.25;
      }
  
-    $score = $pmatch ** ( $q_end - $q_beg + 1 ); 
+    unshift @new_matches, $best_match if $best_match->{ 'SCORE' } > 0;
  
-#     # Punish by difference in height and width of search space,
+    return wantarray ? @new_matches : \@new_matches;
+}
+
+
+sub match_score
+{
+    # Martin A. Hansen, August 2009
      
-#     $q_dim = $q_max - $q_min + 1;
-#     $s_dim = $s_max - $s_min + 1;
+    # Given a match and a search space scores the match according to three criteria:
+    
+    # 1) length of match - the longer the better.
+    # 2) distance to closest corner - the shorter the better.
+    # 3) distance to closest narrow end of the search space - the shorter the better.
      
-#     if ( $q_dim != $s_dim ) {
-#         $score *= abs ( $q_dim - $s_dim ) ** 2; 
-#     }
+    # Each of these scores are divided by search space dimensions, and the total score
+    # is calculated: total = score_len - score_corner - score_narrow.
      
-    return $score if $score > 0.25;
+    # The higher the score, the better the match.
  
-    # Punish by how far the match is to the closest corner of the search
-    # space,
+    my ( $match,   # match
+         $space,   # search space
+       ) = @_;
  
-    $q_delta_beg = $q_beg - $q_min;
-    $s_delta_beg = $s_beg - $s_min;
-    
-    $q_delta_end = $q_max - $q_end;
-    $s_delta_end = $s_max - $s_end;
-    
-    if ( $q_delta_beg > $s_delta_beg ) {
-        $delta_beg_max = $q_delta_beg;
-    } else {
-        $delta_beg_max = $s_delta_beg;
-    }
+    # Returns nothing.
  
-    if ( $q_delta_end > $s_delta_end ) {
-        $delta_end_max = $q_delta_end;
-    } else {
-        $delta_end_max = $s_delta_end;
-    }
+    match_score_len( $match );
+    match_score_diag( $match, $space );
+    match_score_narrow( $match, $space );
+}
  
-    if ( $delta_beg_max <= $delta_end_max ) {
-        $score *= ($delta_beg_max+1) ** 2.0;
-    } else {
-        $score *= ($delta_end_max+1) ** 2.0;
-    }
  
-    return $score if $score > 0.25;
+sub match_score_len
+{
+    # Martin A. Hansen, August 2009
+
+    # Score according to match length.
+
+    my ( $match,   # match
+       ) = @_;
+
+    # Returns nothing.
  
-    # Add penalty if the match is towards the narrow end of the
-    # search space,
+    $match->{ 'SCORE' } = $match->{ 'LEN' } ** 3;
+}
+
+
+sub match_score_diag
+{
+    # Martin A. Hansen, August 2009
  
-    if ( ($q_max - $q_min) <= ($s_max - $s_min) )
+    # Score according to distance from diagonal.
+
+    my ( $match,   # match
+         $space,   # search space
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag );
+
+    $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
+    $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
+
+    if ( $q_dim >= $s_dim ) # s_dim is the narrow end
      {
-        if ( $q_delta_beg > $s_delta_beg )
-        {
-            $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3;
-        }
-        elsif ( $q_delta_end > $s_delta_end )
-        {
-            $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3;
-        }
+        $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
+                                                        $match->{ 'S_BEG' },
+                                                        $space->{ 'Q_MIN' },
+                                                        $space->{ 'S_MIN' },
+                                                        $space->{ 'Q_MIN' } + $s_dim,
+                                                        $space->{ 'S_MIN' } + $s_dim );
+
+        $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
+                                                        $match->{ 'S_BEG' },
+                                                        $space->{ 'Q_MAX' } - $s_dim,
+                                                        $space->{ 'S_MAX' } - $s_dim,
+                                                        $space->{ 'Q_MAX' },
+                                                        $space->{ 'S_MAX' } );
      }
      else
      {
-        if ( $s_delta_beg > $q_delta_beg )
-        {
-            $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3;
-        }
-        elsif ( $s_delta_end > $q_delta_end )
-        {
-            $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3;
-        }        
+        $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
+                                                        $match->{ 'S_BEG' },
+                                                        $space->{ 'Q_MIN' },
+                                                        $space->{ 'S_MIN' },
+                                                        $space->{ 'Q_MIN' } + $q_dim,
+                                                        $space->{ 'S_MIN' } + $q_dim );
+
+        $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
+                                                        $match->{ 'S_BEG' },
+                                                        $space->{ 'Q_MAX' } - $q_dim,
+                                                        $space->{ 's_max' } - $q_dim,
+                                                        $space->{ 'Q_MAX' },
+                                                        $space->{ 'S_MAX' } );
      }
  
-    if ( $score < 0 ) {
-        print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n";
-        die qq (Score <= 0 -> $score);
+    $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
+
+    $score_diag = 2 * $min_diag_dist ** 2;
+
+    $match->{ 'SCORE' } -= $score_diag;
+}
+
+
+sub match_score_narrow
+{
+    # Martin A. Hansen, August 2009
+
+    # Score according to distance to the narrow end of the search space.
+
+    my ( $match,   # match
+         $space,   # search space
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow );
+
+    $max_narrow_dist = 0;
+
+    $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
+    $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
+
+    if ( $q_dim > $s_dim ) # s_dim is the narrow end
+    {
+        $beg_narrow_dist = $match->{ 'Q_BEG' } - $space->{ 'Q_MIN' };
+        $end_narrow_dist = $space->{ 'Q_MAX' } - $match->{ 'Q_BEG' };
+
+        $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
      }
+    elsif ( $q_dim < $s_dim ) # q_dim is the narrow end
+    {
+        $beg_narrow_dist = $match->{ 'S_BEG' } - $space->{ 'S_MIN' };
+        $end_narrow_dist = $space->{ 'S_MAX' } - $match->{ 'S_BEG' };
  
-    return $score;
+        $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
+    }
+
+    $score_narrow = $max_narrow_dist;
+
+    $match->{ 'SCORE' } -= $score_narrow;
  }
  
  
+__END__
+
+
  sub insert_indels
  {
      # Martin A. Hansen, June 2009.
@@ -592,7 +645,7 @@ sub insert_indels
  
      my ( $q_indels, $s_indels, $match, $diff, $first );
  
-    @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
+    @{ $matches } = sort { $a->{ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
  
      $first    = 1;
      $q_indels = 0;
diff --git a/code_perl/Maasha/test/test_AlignTwoSeq.pl b/code_perl/Maasha/test/test_AlignTwoSeq.pl

new file mode 100755 (executable)

index 0000000..0211928
--- /dev/null
+++ b/code_perl/Maasha/test/test_AlignTwoSeq.pl
@@ -0,0 +1,517 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Test::More 'no_plan';
+use Data::Dumper;
+use Maasha::AlignTwoSeq;
+
+test_new_space();
+test_new_space_left();
+test_new_space_right();
+
+test_matches_select();
+
+test_word_size_calc();
+test_seq_index();
+test_seq_scan();
+test_matches_find();
+
+test_match_expand_forward_end_space();
+test_match_expand_forward_end_match();
+test_match_expand_backward_end_space();
+test_match_expand_backward_end_match();
+test_match_expand_end_space();
+test_match_expand_end_match();
+
+test_match_redundant_add();
+test_match_redundant();
+
+test_matches_filter();
+
+test_match_score_narrow();
+test_match_score_diag();
+test_match_score_len();
+test_match_score();
+
+test_align_two_seq();
+
+
+sub test_new_space
+{
+    my $space = { Q_SEQ => \"ATCG", S_SEQ => \"atcg" };
+    
+    Maasha::AlignTwoSeq::new_space( $space );
+
+    is( ${ $space->{ 'Q_SEQ' } }, "ATCG" );
+    is( ${ $space->{ 'S_SEQ' } }, "atcg" );
+    ok( $space->{ 'Q_MIN' } == 0 );
+    ok( $space->{ 'S_MIN' } == 0 );
+    ok( $space->{ 'Q_MAX' } == 3 );
+    ok( $space->{ 'S_MAX' } == 3 );
+}
+
+
+sub test_new_space_left
+{
+    my ( $best_match, $space, $new_space );
+
+    $best_match = {
+        Q_BEG => 2,
+        Q_END => 3,
+        S_BEG => 2,
+        S_END => 3,
+        SCORE => 0,
+        LEN   => 2,
+    };
+
+    $space = {
+        Q_SEQ => \"ATCG",
+        S_SEQ => \"atcg",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 3,
+        S_MAX => 3,
+    };
+
+    $new_space = Maasha::AlignTwoSeq::new_space_left( $best_match, $space );
+
+    ok( defined $new_space );
+    is( ${ $new_space->{ 'Q_SEQ' } }, "ATCG" );
+    is( ${ $new_space->{ 'S_SEQ' } }, "atcg" );
+    ok( $new_space->{ 'Q_MIN' } == 0 );
+    ok( $new_space->{ 'S_MIN' } == 0 );
+    ok( $new_space->{ 'Q_MAX' } == 1 );
+    ok( $new_space->{ 'S_MAX' } == 1 );
+}
+
+
+sub test_new_space_right
+{
+    my ( $best_match, $space, $new_space );
+
+    $best_match = {
+        Q_BEG => 0,
+        Q_END => 1,
+        S_BEG => 0,
+        S_END => 1,
+        SCORE => 0,
+        LEN   => 2,
+    };
+
+    $space = {
+        Q_SEQ => \"ATCG",
+        S_SEQ => \"atcg",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 3,
+        S_MAX => 3,
+    };
+
+    $new_space = Maasha::AlignTwoSeq::new_space_right( $best_match, $space );
+
+    ok( defined $new_space );
+    is( ${ $new_space->{ 'Q_SEQ' } }, "ATCG" );
+    is( ${ $new_space->{ 'S_SEQ' } }, "atcg" );
+    ok( $new_space->{ 'Q_MIN' } == 2 );
+    ok( $new_space->{ 'S_MIN' } == 2 );
+    ok( $new_space->{ 'Q_MAX' } == 3 );
+    ok( $new_space->{ 'S_MAX' } == 3 );
+}
+
+
+sub test_matches_select
+{
+    my ( $matches, $space );
+
+    $matches = [
+        { Q_BEG => 9,  S_BEG =>  9, Q_END => 10, S_END => 10 },
+        { Q_BEG => 10, S_BEG => 10, Q_END => 20, S_END => 20 },
+        { Q_BEG => 20, S_BEG => 20, Q_END => 21, S_END => 21 },
+    ];
+
+    $space = {
+        Q_SEQ => \"ATCG",
+        S_SEQ => \"atcg",
+        Q_MIN => 10,
+        S_MIN => 10,
+        Q_MAX => 20,
+        S_MAX => 20,
+    };
+
+    Maasha::AlignTwoSeq::matches_select( $matches, $space );
+
+    ok( scalar @{ $matches } == 1 );
+}
+
+
+sub test_word_size_calc
+{
+    ok( Maasha::AlignTwoSeq::word_size_calc( { Q_MIN => 0, S_MIN => 0, Q_MAX => 1, S_MAX => 1 } ) == 1 );
+    ok( Maasha::AlignTwoSeq::word_size_calc( { Q_MIN => 0, S_MIN => 0, Q_MAX => 200, S_MAX => 200 } ) == 10 + 1 );
+}
+
+
+sub test_seq_index
+{
+    my ( $space, $word_size, $index );
+
+    $space = {
+        Q_SEQ => \"ATCG",
+        S_SEQ => \"atcg",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 3,
+        S_MAX => 3,
+    };
+
+    $word_size = 2;
+    
+    $index = Maasha::AlignTwoSeq::seq_index( $space, $word_size );
+
+    ok( scalar keys %{ $index } == 3 );
+    ok( exists $index->{ 'AT' } );
+    ok( exists $index->{ 'TC' } );
+    ok( exists $index->{ 'CG' } );
+    ok( $index->{ 'AT' }->[ 0 ] == 0 );
+    ok( $index->{ 'TC' }->[ 0 ] == 1 );
+    ok( $index->{ 'CG' }->[ 0 ] == 2 );
+}
+
+
+sub test_seq_scan
+{
+    my ( $space, $word_size, $index, $matches );
+
+    $space = {
+        Q_SEQ => \"ATCG",
+        S_SEQ => \"atcg",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 3,
+        S_MAX => 3,
+    };
+
+    $word_size = 2;
+    
+    $index   = Maasha::AlignTwoSeq::seq_index( $space, $word_size );
+    $matches = Maasha::AlignTwoSeq::seq_scan( $index, $space, $word_size );
+
+    ok( scalar @{ $matches } == 1 );
+    ok( $matches->[ 0 ]->{ 'Q_BEG' } == 0 );
+    ok( $matches->[ 0 ]->{ 'S_BEG' } == 0 );
+    ok( $matches->[ 0 ]->{ 'Q_END' } == 3 );
+    ok( $matches->[ 0 ]->{ 'S_END' } == 3 );
+    ok( $matches->[ 0 ]->{ 'LEN' }   == 4 );
+    ok( $matches->[ 0 ]->{ 'SCORE' } == 0 );
+}
+
+
+sub test_matches_find
+{
+    my ( $space, $matches );
+
+    $space = { Q_SEQ => \"ATCG", S_SEQ => \"ATCG", Q_MIN => 0, S_MIN => 0, Q_MAX => 3, S_MAX => 3 };
+
+    $matches = Maasha::AlignTwoSeq::matches_find( $space );
+
+    ok( scalar @{ $matches } == 1 );
+    ok( $matches->[ 0 ]->{ 'Q_BEG' } == 0 );
+    ok( $matches->[ 0 ]->{ 'S_BEG' } == 0 );
+    ok( $matches->[ 0 ]->{ 'Q_END' } == 3 );
+    ok( $matches->[ 0 ]->{ 'S_END' } == 3 );
+    ok( $matches->[ 0 ]->{ 'LEN' }   == 4 );
+
+    $space = { Q_SEQ => \"ATXXGAT", S_SEQ => \"ATCGAT", Q_MIN => 0, S_MIN => 0, Q_MAX => 6, S_MAX => 5 };
+
+    $matches = Maasha::AlignTwoSeq::matches_find( $space );
+
+    ok( scalar @{ $matches } == 4 );
+    ok( $matches->[ 0 ]->{ 'Q_BEG' } == 0 );
+    ok( $matches->[ 0 ]->{ 'S_BEG' } == 0 );
+    ok( $matches->[ 0 ]->{ 'Q_END' } == 1 );
+    ok( $matches->[ 0 ]->{ 'S_END' } == 1 );
+    ok( $matches->[ 0 ]->{ 'LEN' }   == 2 );
+    ok( $matches->[ 1 ]->{ 'Q_BEG' } == 5 );
+    ok( $matches->[ 1 ]->{ 'S_BEG' } == 0 );
+    ok( $matches->[ 1 ]->{ 'Q_END' } == 6 );
+    ok( $matches->[ 1 ]->{ 'S_END' } == 1 );
+    ok( $matches->[ 1 ]->{ 'LEN' }   == 2 );
+    ok( $matches->[ 2 ]->{ 'Q_BEG' } == 4 );
+    ok( $matches->[ 2 ]->{ 'S_BEG' } == 3 );
+    ok( $matches->[ 2 ]->{ 'Q_END' } == 6 );
+    ok( $matches->[ 2 ]->{ 'S_END' } == 5 );
+    ok( $matches->[ 2 ]->{ 'LEN' }   == 3 );
+    ok( $matches->[ 3 ]->{ 'Q_BEG' } == 0 );
+    ok( $matches->[ 3 ]->{ 'S_BEG' } == 4 );
+    ok( $matches->[ 3 ]->{ 'Q_END' } == 1 );
+    ok( $matches->[ 3 ]->{ 'S_END' } == 5 );
+    ok( $matches->[ 3 ]->{ 'LEN' }   == 2 );
+}
+
+sub test_match_expand_forward_end_space
+{
+    my ( $match, $space );
+
+    $match = {
+        Q_BEG => 1,
+        Q_END => 2,
+        S_BEG => 1,
+        S_END => 2,
+        SCORE => 0,
+        LEN   => 2,
+    };
+
+    $space = {
+        Q_SEQ => \"ATCG",
+        S_SEQ => \"atcg",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 3,
+        S_MAX => 3,
+    };
+
+    Maasha::AlignTwoSeq::match_expand_forward( $match, $space );
+
+    ok( $match->{ 'Q_BEG' } == 1 );
+    ok( $match->{ 'S_BEG' } == 1 );
+    ok( $match->{ 'Q_END' } == 3 );
+    ok( $match->{ 'S_END' } == 3 );
+    ok( $match->{ 'LEN' }   == 3 );
+}
+
+
+sub test_match_expand_forward_end_match
+{
+    my ( $match, $space );
+
+    $match = {
+        Q_BEG => 1,
+        Q_END => 2,
+        S_BEG => 1,
+        S_END => 2,
+        SCORE => 0,
+        LEN   => 2,
+    };
+
+    $space = {
+        Q_SEQ => \"ATCGXX",
+        S_SEQ => \"atcgnn",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 6,
+        S_MAX => 6,
+    };
+
+    Maasha::AlignTwoSeq::match_expand_forward( $match, $space );
+
+    ok( $match->{ 'Q_BEG' } == 1 );
+    ok( $match->{ 'S_BEG' } == 1 );
+    ok( $match->{ 'Q_END' } == 3 );
+    ok( $match->{ 'S_END' } == 3 );
+    ok( $match->{ 'LEN' }   == 3 );
+}
+
+
+sub test_match_expand_backward_end_space
+{
+    my ( $match, $space );
+
+    $match = {
+        Q_BEG => 1,
+        Q_END => 2,
+        S_BEG => 1,
+        S_END => 2,
+        SCORE => 0,
+        LEN   => 2,
+    };
+
+    $space = {
+        Q_SEQ => \"ATCG",
+        S_SEQ => \"atcg",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 3,
+        S_MAX => 3,
+    };
+
+    Maasha::AlignTwoSeq::match_expand_backward( $match, $space );
+
+    ok( $match->{ 'Q_BEG' } == 0 );
+    ok( $match->{ 'S_BEG' } == 0 );
+    ok( $match->{ 'Q_END' } == 2 );
+    ok( $match->{ 'S_END' } == 2 );
+    ok( $match->{ 'LEN' }   == 3 );
+}
+
+
+sub test_match_expand_backward_end_match
+{
+    my ( $match, $space );
+
+    $match = {
+        Q_BEG => 2,
+        Q_END => 3,
+        S_BEG => 2,
+        S_END => 3,
+        SCORE => 0,
+        LEN   => 2,
+    };
+
+    $space = {
+        Q_SEQ => \"XATCG",
+        S_SEQ => \"natcg",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 4,
+        S_MAX => 4,
+    };
+
+    Maasha::AlignTwoSeq::match_expand_backward( $match, $space );
+
+    ok( $match->{ 'Q_BEG' } == 1 );
+    ok( $match->{ 'S_BEG' } == 1 );
+    ok( $match->{ 'Q_END' } == 3 );
+    ok( $match->{ 'S_END' } == 3 );
+    ok( $match->{ 'LEN' }   == 3 );
+}
+
+
+sub test_match_expand_end_space
+{
+    my ( $match, $space );
+
+    $match = {
+        Q_BEG => 1,
+        Q_END => 2,
+        S_BEG => 1,
+        S_END => 2,
+        SCORE => 0,
+        LEN   => 2,
+    };
+
+    $space = {
+        Q_SEQ => \"ATCG",
+        S_SEQ => \"atcg",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 3,
+        S_MAX => 3,
+    };
+
+    Maasha::AlignTwoSeq::match_expand( $match, $space );
+
+    ok( $match->{ 'Q_BEG' } == 0 );
+    ok( $match->{ 'S_BEG' } == 0 );
+    ok( $match->{ 'Q_END' } == 3 );
+    ok( $match->{ 'S_END' } == 3 );
+    ok( $match->{ 'LEN' }   == 4 );
+}
+
+
+sub test_match_expand_end_match
+{
+    my ( $match, $space );
+
+    $match = {
+        Q_BEG => 2,
+        Q_END => 3,
+        S_BEG => 2,
+        S_END => 3,
+        SCORE => 0,
+        LEN   => 2,
+    };
+
+    $space = {
+        Q_SEQ => \"XATCGX",
+        S_SEQ => \"natcgn",
+        Q_MIN => 0,
+        S_MIN => 0,
+        Q_MAX => 6,
+        S_MAX => 6,
+    };
+
+    Maasha::AlignTwoSeq::match_expand( $match, $space );
+
+    ok( $match->{ 'Q_BEG' } == 1 );
+    ok( $match->{ 'S_BEG' } == 1 );
+    ok( $match->{ 'Q_END' } == 4 );
+    ok( $match->{ 'S_END' } == 4 );
+    ok( $match->{ 'LEN' }   == 4 );
+}
+
+
+sub test_match_redundant_add
+{
+    my ( $redundant );
+
+    $redundant = {};
+
+    Maasha::AlignTwoSeq::match_redundant_add( { Q_BEG => 10, Q_END => 20, S_BEG => 110, S_END => 120 }, $redundant );
+    Maasha::AlignTwoSeq::match_redundant_add( { Q_BEG => 15, Q_END => 25, S_BEG => 210, S_END => 220 }, $redundant );
+
+    ok( scalar keys %{ $redundant } == 16 );
+}
+
+
+sub test_match_redundant
+{
+    my ( $redundant );
+
+    $redundant = {};
+
+    Maasha::AlignTwoSeq::match_redundant_add( { Q_BEG => 10, Q_END => 20, S_BEG => 110, S_END => 120 }, $redundant );
+    Maasha::AlignTwoSeq::match_redundant_add( { Q_BEG => 15, Q_END => 25, S_BEG => 210, S_END => 220 }, $redundant );
+
+    ok( Maasha::AlignTwoSeq::match_redundant( { Q_BEG => 10, Q_END => 20, S_BEG => 110, S_END => 120 }, $redundant ) );
+    ok( not Maasha::AlignTwoSeq::match_redundant( { Q_BEG => 1, Q_END => 2, S_BEG => 110, S_END => 120 }, $redundant ) );
+}
+
+
+sub test_matches_filter
+{
+    ok( 0 );
+}
+
+
+sub test_match_score_narrow
+{
+    ok( 0 );
+}
+
+
+sub test_match_score_diag
+{
+    ok( 0 );
+}
+
+
+sub test_match_score_len
+{
+    ok( 0 );
+}
+
+
+sub test_match_score
+{
+    ok( 0 );
+}
+
+
+sub test_align_two_seq
+{
+    my ( $space, $matches );
+
+    $space = {
+        Q_SEQ => \"ATXCG",
+        S_SEQ => \"ATCG",
+    };
+
+    $matches = Maasha::AlignTwoSeq::align_two_seq( $space, [] );
+
+    print Dumper( $matches );
+
+    ok( 0 );
+}
+
+
author	martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
	Thu, 27 Aug 2009 15:05:05 +0000 (15:05 +0000)
committer	martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
	Thu, 27 Aug 2009 15:05:05 +0000 (15:05 +0000)
bp_bin/align_pair_seq		patch \| blob \| history
code_c/Maasha/src/Makefile		patch \| blob \| history
code_perl/Maasha/AlignTwoSeq.pm		patch \| blob \| history
code_perl/Maasha/test/test_AlignTwoSeq.pl	[new file with mode: 0755]	patch \| blob