From 30cc0713754e50b1e832cf166285f65b4cd575c2 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 27 Aug 2009 15:05:05 +0000 Subject: [PATCH] working on AlignTwoSeq git-svn-id: http://biopieces.googlecode.com/svn/trunk@645 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/align_pair_seq | 5 +- code_c/Maasha/src/Makefile | 5 +- code_perl/Maasha/AlignTwoSeq.pm | 787 ++++++++++++---------- code_perl/Maasha/test/test_AlignTwoSeq.pl | 517 ++++++++++++++ 4 files changed, 945 insertions(+), 369 deletions(-) create mode 100755 code_perl/Maasha/test/test_AlignTwoSeq.pl diff --git a/bp_bin/align_pair_seq b/bp_bin/align_pair_seq index 1a9d673..24b8c65 100755 --- a/bp_bin/align_pair_seq +++ b/bp_bin/align_pair_seq @@ -30,6 +30,7 @@ use warnings; use strict; use Maasha::Biopieces; use Maasha::AlignTwoSeq; +use Data::Dumper; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -57,8 +58,10 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) $records[ 0 ]->{ 'SEQ' } = lc $records[ 0 ]->{ 'SEQ' }; $records[ 1 ]->{ 'SEQ' } = lc $records[ 1 ]->{ 'SEQ' }; - my $matches = Maasha::AlignTwoSeq::align_two_seq( \$records[ 0 ]->{ 'SEQ' } ,\$records[ 1 ]->{ 'SEQ' } ); + my $matches = Maasha::AlignTwoSeq::align_two_seq( { 'Q_SEQ' => \$records[ 0 ]->{ 'SEQ' } , 'S_SEQ' => \$records[ 1 ]->{ 'SEQ' } }, [] ); + print Dumper( $matches ); + Maasha::AlignTwoSeq::insert_indels( $matches, \$records[ 0 ]->{ 'SEQ' } ,\$records[ 1 ]->{ 'SEQ' } ); map { Maasha::Biopieces::put_record( $_, $out ) } @records; diff --git a/code_c/Maasha/src/Makefile b/code_c/Maasha/src/Makefile index 3521b3a..bbd83c2 100644 --- a/code_c/Maasha/src/Makefile +++ b/code_c/Maasha/src/Makefile @@ -12,7 +12,7 @@ INC = -I $(INC_DIR) LIB = -lm $(LIB_DIR)*.o # all: libs utest bed2fixedstep bed2tag_contigs bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic -all: libs bed2fixedstep bed2tag_contigs bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic +all: libs align_two_seq bed2fixedstep bed2tag_contigs bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic libs: cd $(LIB_DIR) && ${MAKE} all @@ -20,6 +20,9 @@ libs: utest: cd $(TEST_DIR) && ${MAKE} all +align_two_seq: align_two_seq.c + $(CC) $(CFLAGS) $(INC) $(LIB) align_two_seq.c -o align_two_seq + bed2fixedstep: bed2fixedstep.c $(CC) $(CFLAGS) $(INC) $(LIB) bed2fixedstep.c -o bed2fixedstep diff --git a/code_perl/Maasha/AlignTwoSeq.pm b/code_perl/Maasha/AlignTwoSeq.pm index 94f438a..506f3a2 100644 --- a/code_perl/Maasha/AlignTwoSeq.pm +++ b/code_perl/Maasha/AlignTwoSeq.pm @@ -22,7 +22,7 @@ package Maasha::AlignTwoSeq; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# yak yak yak +# Routines to create a pair-wise alignment. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -31,23 +31,10 @@ package Maasha::AlignTwoSeq; use warnings; use strict; use Data::Dumper; -use Storable qw( dclone ); use Maasha::Calc; use Maasha::Seq; use vars qw ( @ISA @EXPORT ); -use constant { - Q_BEG => 0, - Q_END => 1, - S_BEG => 2, - S_END => 3, - LEN => 4, - SCORE => 5, - HEAD => 0, - SEQ => 1, - ALPH_LEN => 4, -}; - @ISA = qw( Exporter ); @@ -66,64 +53,43 @@ sub align_two_seq # on a calculated score. New search spaces spanning the spaces between # matches and the search space boundaries will be cast and recursed into. - my ( $q_seq, # sequence 1 ref - $s_seq, # sequence 2 ref + # Usage: align_two_seq( { Q_SEQ => \$q_seq, S_SEQ => \$s_seq }, [] ); + + my ( $space, # search space $matches, # list of matches - $q_min, # q sequence start position - $q_max, # q sequecne stop position - $s_min, # s sequence start position - $s_max, # s sequecne stop position ) = @_; - # returns a chain of matches that can be chained into an alignment + # Returns a list. - $matches ||= []; - $q_min ||= 0; - $s_min ||= 0; - $q_max ||= length( ${ $q_seq } ) - 1; - $s_max ||= length( ${ $s_seq } ) - 1; + my ( @chain, $best_match, $left_space, $right_space ); - my ( $wordsize, @chain, $match, $best_match, @long_matches ); + new_space( $space ); - $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max ); + matches_select( $matches, $space ); if ( scalar @{ $matches } == 0 ) # no matches - find some! { - $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max ); - $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max ); + $matches = matches_find( $space ); - while ( scalar @{ $matches } == 0 and $wordsize > 1 ) - { - $wordsize--; - $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max ); - } - if ( scalar @{ $matches } > 0 ) { - push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max ); + push @chain, align_two_seq( $space, $matches ); } } else # matches are included according to score { - foreach $match ( @{ $matches } ) { - $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max ); - } - - @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches }; - @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches }; + $matches = matches_filter( $space, $matches ); - $best_match = shift @{ $matches }; + $best_match = shift @{ $matches }; if ( $best_match ) { push @chain, $best_match; - if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) { - push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1 ); # left search space - } + $left_space = new_space_left( $best_match, $space ); + $right_space = new_space_right( $best_match, $space ); - if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) { - push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max ); # right search space - } + push @chain, align_two_seq( $left_space, $matches ) if defined $left_space; + push @chain, align_two_seq( $right_space, $matches ) if defined $right_space; } } @@ -131,7 +97,92 @@ sub align_two_seq } -sub select_matches +sub new_space +{ + # Martin A. Hansen, August 2009. + + # Define a new search space if not previously + # defined. This occurs if align_two_seq() is + # called initially with only the sequences. + + my ( $space, # search space + ) = @_; + + # Returns nothing. + + if ( not defined $space->{ 'Q_MAX' } ) + { + $space->{ 'Q_MIN' } = 0; + $space->{ 'S_MIN' } = 0; + $space->{ 'Q_MAX' } = length( ${ $space->{ 'Q_SEQ' } } ) - 1, + $space->{ 'S_MAX' } = length( ${ $space->{ 'S_SEQ' } } ) - 1, + } +} + + +sub new_space_left +{ + # Martin A. Hansen, August 2009. + + # Create a new search space between the beginning of + # the current search space and the best match. + + my ( $best_match, # best match + $space, # search space + ) = @_; + + # Returns hashref. + + my ( $new_space ); + + if ( $best_match->{ 'Q_BEG' } - $space->{ 'Q_MIN' } >= 2 and $best_match->{ 'S_BEG' } - $space->{ 'S_MIN' } >= 2 ) + { + $new_space = { + Q_SEQ => $space->{ 'Q_SEQ' }, + S_SEQ => $space->{ 'S_SEQ' }, + Q_MIN => $space->{ 'Q_MIN' }, + Q_MAX => $best_match->{ 'Q_BEG' } - 1, + S_MIN => $space->{ 'S_MIN' }, + S_MAX => $best_match->{ 'S_BEG' } - 1, + }; + } + + return wantarray ? %{ $new_space } : $new_space; +} + + +sub new_space_right +{ + # Martin A. Hansen, August 2009. + + # Create a new search space between the best match + # and the end of the current search space. + + my ( $best_match, # best match + $space, # search space + ) = @_; + + # Returns hashref. + + my ( $new_space ); + + if ( $space->{ 'Q_MAX' } - $best_match->{ 'Q_END' } >= 2 and $space->{ 'S_MAX' } - $best_match->{ 'S_END' } >= 2 ) + { + $new_space = { + Q_SEQ => $space->{ 'Q_SEQ' }, + S_SEQ => $space->{ 'S_SEQ' }, + Q_MIN => $best_match->{ 'Q_END' } + 1, + Q_MAX => $space->{ 'Q_MAX' }, + S_MIN => $best_match->{ 'S_END' } + 1, + S_MAX => $space->{ 'S_MAX' }, + }; + } + + return wantarray ? %{ $new_space } : $new_space; +} + + +sub matches_select { # Martin A. Hansen, August 2006. @@ -140,168 +191,113 @@ sub select_matches # this search space. my ( $matches, # list of matches - $q_min, # q sequence start position - $q_max, # q sequecne stop position - $s_min, # s sequence start position - $s_max, # s sequecne stop position + $space, # search space ) = @_; - # returns list of matches - - my ( @matches ); - - @matches = grep { $_->[ Q_BEG ] >= $q_min and - $_->[ S_BEG ] >= $s_min and - $_->[ Q_END ] <= $q_max and - $_->[ S_END ] <= $s_max } @{ $matches }; + # Returns nothing. - return wantarray ? @matches : \@matches; + @{ $matches } = grep { $_->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and + $_->{ 'S_BEG' } >= $space->{ 'S_MIN' } and + $_->{ 'Q_END' } <= $space->{ 'Q_MAX' } and + $_->{ 'S_END' } <= $space->{ 'S_MAX' } } @{ $matches }; } -sub find_wordsize +sub word_size_calc { - # Martin A. Hansen, August 2006. - + # Martin A. Hansen, August 2008. + # Given a search space calculates the wordsize for a word so a match # occurs only a few times. More matches may be needed at low similarity in # order to avoid starting with a wrong match. - my ( $q_min, # q sequence start position - $q_max, # q sequecne stop position - $s_min, # s sequence start position - $s_max, # s sequecne stop position + my ( $space, # search space ) = @_; - # returns integer - - my ( $q_dim, $s_dim, $dim_min, $wordsize ); + # Returns integer. - $q_dim = $q_max - $q_min + 1; - $s_dim = $s_max - $s_min + 1; + my ( $factor, $word_size ); - $dim_min = Maasha::Calc::min( $q_dim, $s_dim ); + $factor = 0.05; - $wordsize = 1; - - if ( $dim_min > 2000000 ) # optimized for DNA - { - $wordsize = 50; - } - elsif ( $dim_min > 100000 ) # optimized for DNA - { - $wordsize = 25; - } - elsif ( $q_dim > 100 or $s_dim > 100 ) # optimized for DNA - { - while ( ALPH_LEN ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) { - $wordsize++; - } - } - else - { - while ( ALPH_LEN ** $wordsize <= $dim_min and $wordsize < $dim_min ) { - $wordsize++; - } + if ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } < $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) { + $word_size = int( $factor * ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } ) ) + 1; + } else { + $word_size = int( $factor * ( $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) ) + 1; } - return $wordsize; + return $word_size; } -sub find_matches +sub seq_index { - # Martin A. Hansen, November 2006 + # Martin A. Hansen, August 2009. - # given two sequences, find all maximum expanded matches between these + # Indexes a query sequence in a specified search space + # Overlapping words are saved as keys in an index hash, + # and a list of word postions are saved as value. - my ( $q_seq, # sequence 1 - $s_seq, # sequence 2 - $wordsize, # word size - $q_min, # q sequence start position - $q_max, # q sequecne stop position - $s_min, # s sequence start position - $s_max, # s sequecne stop position + my ( $space, # search space + $word_size, # word size ) = @_; - # returns list of matches - - my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches ); + # Returns a hashref. + + my ( $i, $word, %index ); - if ( length ${ $s_seq } > length ${ $q_seq } ) + for ( $i = $space->{ 'Q_MIN' }; $i <= $space->{ 'Q_MAX' } - $word_size + 1; $i++ ) { - for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ ) - { - $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize; + $word = uc substr ${ $space->{ 'Q_SEQ' } }, $i, $word_size; - next if $q_word =~ /n/i; # DNA/genome optimization + next if $word =~ tr/N//; # Skip sequences with Ns. - push @{ $word_hash{ $q_word } }, $q_beg; - } + push @{ $index{ $word } }, $i; + } - for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ ) - { - $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize; + return wantarray ? %index : \%index; +} - if ( exists $word_hash{ $s_word } ) - { - foreach $q_beg ( @{ $word_hash{ $s_word } } ) - { - $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ]; - - if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and - $match->[ Q_END ] <= $_->[ Q_END ] and - $match->[ S_BEG ] >= $_->[ S_BEG ] and - $match->[ S_END ] <= $_->[ S_END ] } @matches ) - { - next; # match is redundant - } - else - { - $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min ); - $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1; - - push @matches, $match; - } - } - } - } - } - else - { - for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ ) - { - $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize; - next if $s_word =~ /n/i; # DNA/genome optimization +sub seq_scan +{ + # Martin A. Hansen, August 2009. - push @{ $word_hash{ $s_word } }, $s_beg; - } + my ( $index, # sequence index + $space, # search space + $word_size, # word size + ) = @_; - for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ ) - { - $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize; + # Returns a list. + + my ( $i, $word, $pos, $match, @matches, $redundant ); - if ( exists $word_hash{ $q_word } ) + $redundant = {}; + + for ( $i = $space->{ 'S_MIN' }; $i <= $space->{ 'S_MAX' } - $word_size + 1; $i++ ) + { + $word = uc substr ${ $space->{ 'S_SEQ' } }, $i, $word_size; + + if ( exists $index->{ $word } ) + { + foreach $pos ( @{ $index->{ $word } } ) { - foreach $s_beg ( @{ $word_hash{ $q_word } } ) + $match = { + 'Q_BEG' => $pos, + 'Q_END' => $pos + $word_size - 1, + 'S_BEG' => $i, + 'S_END' => $i + $word_size - 1, + 'LEN' => $word_size, + 'SCORE' => 0, + }; + + if ( not match_redundant( $match, $redundant ) ) { - $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ]; - - if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and - $match->[ Q_END ] <= $_->[ Q_END ] and - $match->[ S_BEG ] >= $_->[ S_BEG ] and - $match->[ S_END ] <= $_->[ S_END ] } @matches ) - { - next; # match is redundant - } - else - { - $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min ); - $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1; - - push @matches, $match; - } + match_expand( $match, $space ); + + push @matches, $match; + + match_redundant_add( $match, $redundant ); } } } @@ -311,271 +307,328 @@ sub find_matches } -sub expand_match +sub matches_find { - # Martin A. Hansen, August 2006. + # Martin A. Hansen, November 2006 - # Given two sequences and a match, expand the match maximally. - # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ] + # Given two sequences, find all maximum expanded matches between these. - my ( $q_seq, # sequence 1 ref - $s_seq, # sequence 2 ref - $match, # sequence match - $q_max, # q sequecne stop position - $q_min, # q sequence start position - $s_max, # s sequecne stop position - $s_min, # s sequence start position + my ( $space, # search space ) = @_; - # returns match + # returns list of matches - my ( $q_pos, $s_pos ); + my ( $word_size, $index, $matches ); - # expanding forward + $word_size = word_size_calc( $space ); - $q_pos = $match->[ Q_END ] + 1; - $s_pos = $match->[ S_END ] + 1; + $matches = []; - while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) ) + while ( scalar @{ $matches } == 0 and $word_size > 0 ) { - $q_pos++; - $s_pos++; + $index = seq_index( $space, $word_size ); + $matches = seq_scan( $index, $space, $word_size ); + + $word_size--; } - $match->[ Q_END ] = $q_pos - 1; - $match->[ S_END ] = $s_pos - 1; + return wantarray ? @{ $matches } : $matches; +} - # expanding backwards - $q_pos = $match->[ Q_BEG ] - 1; - $s_pos = $match->[ S_BEG ] - 1; +sub match_expand +{ + # Martin A. Hansen, August 2009. - while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) ) - { - $q_pos--; - $s_pos--; - } + # Given a match and a search space hash, + # expand the match maximally both forward and + # backward direction. + + my ( $match, # match to expand + $space, # search space + ) = @_; - $match->[ Q_BEG ] = $q_pos + 1; - $match->[ S_BEG ] = $s_pos + 1; + # Returns nothing. - return $match; + match_expand_forward( $match, $space ); + match_expand_backward( $match, $space ); } -sub score_match +sub match_expand_forward { - # Martin A. Hansen, August 2006 - - # given a match and a search space scores the match according to three criteria: - - # 1) length of match - the longer the better. - # 2) distance to closest corner - the shorter the better. - # 3) distance to closest narrow end of the search space - the shorter the better. - - # each of these scores are divided by search space dimentions, and the total score - # is calculated: total = score_len - score_corner - score_narrow - - # the higher the score, the better the match. - - my ( $match, # match - $q_min, # q sequence start position - $q_max, # q sequecne stop position - $s_min, # s sequence start position - $s_max, # s sequecne stop position - ) = @_; - - # returns a positive number - - my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist, - $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist ); + # Martin A. Hansen, August 2009. - # ----- 1) scoring according to match length + # Given a match and a search space hash, + # expand the match maximally in the forward + # direction. - $score_len = $match->[ LEN ] ** 3; - - # ---- 2) score according to distance away from diagonal + my ( $match, # match to expand + $space, # search space + ) = @_; - $q_dim = $q_max - $q_min + 1; - $s_dim = $s_max - $s_min + 1; + # Returns nothing. - if ( $q_dim >= $s_dim ) # s_dim is the narrow end + while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and + $match->{ 'S_END' } <= $space->{ 'S_MAX' } and + lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) ) { - $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim ); - $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max ); + $match->{ 'Q_END' }++; + $match->{ 'S_END' }++; + $match->{ 'LEN' }++; } - else + + $match->{ 'Q_END' }--; + $match->{ 'S_END' }--; + $match->{ 'LEN' }--; +} + + +sub match_expand_backward +{ + # Martin A. Hansen, August 2009. + + # Given a match and a search space hash, + # expand the match maximally in the backward + # direction. + + my ( $match, # match to expand + $space, # search space + ) = @_; + + # Returns nothing. + + while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and + $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and + lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) ) { - $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim ); - $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max ); + $match->{ 'Q_BEG'}--; + $match->{ 'S_BEG'}--; + $match->{ 'LEN' }++; } - $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist ); + $match->{ 'Q_BEG'}++; + $match->{ 'S_BEG'}++; + $match->{ 'LEN' }--; +} - $score_diag = 2 * $min_diag_dist ** 2; - # ----- 3) scoring according to distance to the narrow end of the search space +sub match_redundant +{ + # Martin A. Hansen, August 2009 + + my ( $new_match, # match + $redundant, # hashref + ) = @_; - if ( $q_dim > $s_dim ) # s_dim is the narrow end - { - $beg_narrow_dist = $match->[ Q_BEG ] - $q_min; - $end_narrow_dist = $q_max - $match->[ Q_BEG ]; + # Returns boolean. - $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist ); - } - elsif ( $q_dim < $s_dim ) - { - $beg_narrow_dist = $match->[ S_BEG ] - $s_min; - $end_narrow_dist = $s_max - $match->[ S_BEG ]; + my ( $match ); - $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist ); - } - else + foreach $match ( @{ $redundant->{ $new_match->{ 'Q_BEG' } } } ) { - $max_narrow_dist = 0; + if ( $new_match->{ 'S_BEG' } >= $match->{ 'S_BEG' } and $new_match->{ 'S_END' } <= $match->{ 'S_END' } ) { + return 1; + } } - $score_narrow = $max_narrow_dist; + return 0; +} + + +sub match_redundant_add +{ + # Martin A. Hansen, August 2009 + + my ( $match, # match + $redundant, # hash ref + ) = @_; - $score_total = $score_len - $score_narrow - $score_diag; + # Returns nothing. - return $score_total; + map { push @{ $redundant->{ $_ } }, $match } ( $match->{ 'Q_BEG' } .. $match->{ 'Q_END' } ); } -sub score_match_niels +sub matches_filter { - # Niels Larsen, June 2004. - - # Creates a crude "heuristic" attempt of telling how likely it is that a - # given match occurs by chance in a given search space. If sequences are - # given their composition is taken into account. The scoring punishes - # distance from diagonal(s) and distance from previous match(es). Scores - # range from zero and up, and lower is better. - - my ( $match, # Match array - $q_seq, # Either q_seq or s_seq - $q_min, # Lower bound search area (query sequence) - $q_max, # Upper bound search area (query sequence) - $s_min, # Lower bound search area (subject sequence) - $s_max, # Upper bound search area (subject sequence) - ) = @_; - - # Returns a positive number. - - my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos, - $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i, - $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch, - $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta, - $delta_max ); - - $q_beg = $match->[Q_BEG]; - $q_end = $match->[Q_END]; - $s_beg = $match->[S_BEG]; - $s_end = $match->[S_END]; - - # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<< + # Martin A. Hansen, August 2009. - # Get the probability of a match from the sequence composition (when - # match is longer than 20 and sequence is given, otherwise use 0.25) - # and raise that to the power of the length. + my ( $space, # search space + $matches, # list of matches + ) = @_; - if ( $match->[LEN] > 20 and defined $q_seq ) - { - $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1; - $seqlen = length $seq; + # Returns a list. - $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0; + my ( $best_match, $match, @new_matches ); - for ( $i = 0; $i < $seqlen; $i++ ) + $best_match = shift @{ $matches }; + + match_score( $best_match, $space ); + + foreach $match ( @{ $matches } ) + { + match_score( $match, $space ); + + if ( $match->{ 'SCORE' } > 0 ) { - $chars{ substr $seq, $i, 1 }++; + if ( $match->{ 'SCORE' } > $best_match->{ 'SCORE' } ) { + $best_match = $match; + } else { + push @new_matches, $match; + } } - - $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2 - + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2; - } - else { - $pmatch = 0.25; } - $score = $pmatch ** ( $q_end - $q_beg + 1 ); + unshift @new_matches, $best_match if $best_match->{ 'SCORE' } > 0; -# # Punish by difference in height and width of search space, + return wantarray ? @new_matches : \@new_matches; +} + + +sub match_score +{ + # Martin A. Hansen, August 2009 -# $q_dim = $q_max - $q_min + 1; -# $s_dim = $s_max - $s_min + 1; + # Given a match and a search space scores the match according to three criteria: + + # 1) length of match - the longer the better. + # 2) distance to closest corner - the shorter the better. + # 3) distance to closest narrow end of the search space - the shorter the better. -# if ( $q_dim != $s_dim ) { -# $score *= abs ( $q_dim - $s_dim ) ** 2; -# } + # Each of these scores are divided by search space dimensions, and the total score + # is calculated: total = score_len - score_corner - score_narrow. - return $score if $score > 0.25; + # The higher the score, the better the match. - # Punish by how far the match is to the closest corner of the search - # space, + my ( $match, # match + $space, # search space + ) = @_; - $q_delta_beg = $q_beg - $q_min; - $s_delta_beg = $s_beg - $s_min; - - $q_delta_end = $q_max - $q_end; - $s_delta_end = $s_max - $s_end; - - if ( $q_delta_beg > $s_delta_beg ) { - $delta_beg_max = $q_delta_beg; - } else { - $delta_beg_max = $s_delta_beg; - } + # Returns nothing. - if ( $q_delta_end > $s_delta_end ) { - $delta_end_max = $q_delta_end; - } else { - $delta_end_max = $s_delta_end; - } + match_score_len( $match ); + match_score_diag( $match, $space ); + match_score_narrow( $match, $space ); +} - if ( $delta_beg_max <= $delta_end_max ) { - $score *= ($delta_beg_max+1) ** 2.0; - } else { - $score *= ($delta_end_max+1) ** 2.0; - } - return $score if $score > 0.25; +sub match_score_len +{ + # Martin A. Hansen, August 2009 + + # Score according to match length. + + my ( $match, # match + ) = @_; + + # Returns nothing. - # Add penalty if the match is towards the narrow end of the - # search space, + $match->{ 'SCORE' } = $match->{ 'LEN' } ** 3; +} + + +sub match_score_diag +{ + # Martin A. Hansen, August 2009 - if ( ($q_max - $q_min) <= ($s_max - $s_min) ) + # Score according to distance from diagonal. + + my ( $match, # match + $space, # search space + ) = @_; + + # Returns nothing. + + my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag ); + + $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1; + $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1; + + if ( $q_dim >= $s_dim ) # s_dim is the narrow end { - if ( $q_delta_beg > $s_delta_beg ) - { - $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3; - } - elsif ( $q_delta_end > $s_delta_end ) - { - $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3; - } + $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' }, + $match->{ 'S_BEG' }, + $space->{ 'Q_MIN' }, + $space->{ 'S_MIN' }, + $space->{ 'Q_MIN' } + $s_dim, + $space->{ 'S_MIN' } + $s_dim ); + + $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' }, + $match->{ 'S_BEG' }, + $space->{ 'Q_MAX' } - $s_dim, + $space->{ 'S_MAX' } - $s_dim, + $space->{ 'Q_MAX' }, + $space->{ 'S_MAX' } ); } else { - if ( $s_delta_beg > $q_delta_beg ) - { - $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3; - } - elsif ( $s_delta_end > $q_delta_end ) - { - $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3; - } + $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' }, + $match->{ 'S_BEG' }, + $space->{ 'Q_MIN' }, + $space->{ 'S_MIN' }, + $space->{ 'Q_MIN' } + $q_dim, + $space->{ 'S_MIN' } + $q_dim ); + + $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' }, + $match->{ 'S_BEG' }, + $space->{ 'Q_MAX' } - $q_dim, + $space->{ 's_max' } - $q_dim, + $space->{ 'Q_MAX' }, + $space->{ 'S_MAX' } ); } - if ( $score < 0 ) { - print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n"; - die qq (Score <= 0 -> $score); + $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist ); + + $score_diag = 2 * $min_diag_dist ** 2; + + $match->{ 'SCORE' } -= $score_diag; +} + + +sub match_score_narrow +{ + # Martin A. Hansen, August 2009 + + # Score according to distance to the narrow end of the search space. + + my ( $match, # match + $space, # search space + ) = @_; + + # Returns nothing. + + my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow ); + + $max_narrow_dist = 0; + + $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1; + $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1; + + if ( $q_dim > $s_dim ) # s_dim is the narrow end + { + $beg_narrow_dist = $match->{ 'Q_BEG' } - $space->{ 'Q_MIN' }; + $end_narrow_dist = $space->{ 'Q_MAX' } - $match->{ 'Q_BEG' }; + + $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist ); } + elsif ( $q_dim < $s_dim ) # q_dim is the narrow end + { + $beg_narrow_dist = $match->{ 'S_BEG' } - $space->{ 'S_MIN' }; + $end_narrow_dist = $space->{ 'S_MAX' } - $match->{ 'S_BEG' }; - return $score; + $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist ); + } + + $score_narrow = $max_narrow_dist; + + $match->{ 'SCORE' } -= $score_narrow; } +__END__ + + sub insert_indels { # Martin A. Hansen, June 2009. @@ -592,7 +645,7 @@ sub insert_indels my ( $q_indels, $s_indels, $match, $diff, $first ); - @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches }; + @{ $matches } = sort { $a->{ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches }; $first = 1; $q_indels = 0; diff --git a/code_perl/Maasha/test/test_AlignTwoSeq.pl b/code_perl/Maasha/test/test_AlignTwoSeq.pl new file mode 100755 index 0000000..0211928 --- /dev/null +++ b/code_perl/Maasha/test/test_AlignTwoSeq.pl @@ -0,0 +1,517 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Test::More 'no_plan'; +use Data::Dumper; +use Maasha::AlignTwoSeq; + +test_new_space(); +test_new_space_left(); +test_new_space_right(); + +test_matches_select(); + +test_word_size_calc(); +test_seq_index(); +test_seq_scan(); +test_matches_find(); + +test_match_expand_forward_end_space(); +test_match_expand_forward_end_match(); +test_match_expand_backward_end_space(); +test_match_expand_backward_end_match(); +test_match_expand_end_space(); +test_match_expand_end_match(); + +test_match_redundant_add(); +test_match_redundant(); + +test_matches_filter(); + +test_match_score_narrow(); +test_match_score_diag(); +test_match_score_len(); +test_match_score(); + +test_align_two_seq(); + + +sub test_new_space +{ + my $space = { Q_SEQ => \"ATCG", S_SEQ => \"atcg" }; + + Maasha::AlignTwoSeq::new_space( $space ); + + is( ${ $space->{ 'Q_SEQ' } }, "ATCG" ); + is( ${ $space->{ 'S_SEQ' } }, "atcg" ); + ok( $space->{ 'Q_MIN' } == 0 ); + ok( $space->{ 'S_MIN' } == 0 ); + ok( $space->{ 'Q_MAX' } == 3 ); + ok( $space->{ 'S_MAX' } == 3 ); +} + + +sub test_new_space_left +{ + my ( $best_match, $space, $new_space ); + + $best_match = { + Q_BEG => 2, + Q_END => 3, + S_BEG => 2, + S_END => 3, + SCORE => 0, + LEN => 2, + }; + + $space = { + Q_SEQ => \"ATCG", + S_SEQ => \"atcg", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 3, + S_MAX => 3, + }; + + $new_space = Maasha::AlignTwoSeq::new_space_left( $best_match, $space ); + + ok( defined $new_space ); + is( ${ $new_space->{ 'Q_SEQ' } }, "ATCG" ); + is( ${ $new_space->{ 'S_SEQ' } }, "atcg" ); + ok( $new_space->{ 'Q_MIN' } == 0 ); + ok( $new_space->{ 'S_MIN' } == 0 ); + ok( $new_space->{ 'Q_MAX' } == 1 ); + ok( $new_space->{ 'S_MAX' } == 1 ); +} + + +sub test_new_space_right +{ + my ( $best_match, $space, $new_space ); + + $best_match = { + Q_BEG => 0, + Q_END => 1, + S_BEG => 0, + S_END => 1, + SCORE => 0, + LEN => 2, + }; + + $space = { + Q_SEQ => \"ATCG", + S_SEQ => \"atcg", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 3, + S_MAX => 3, + }; + + $new_space = Maasha::AlignTwoSeq::new_space_right( $best_match, $space ); + + ok( defined $new_space ); + is( ${ $new_space->{ 'Q_SEQ' } }, "ATCG" ); + is( ${ $new_space->{ 'S_SEQ' } }, "atcg" ); + ok( $new_space->{ 'Q_MIN' } == 2 ); + ok( $new_space->{ 'S_MIN' } == 2 ); + ok( $new_space->{ 'Q_MAX' } == 3 ); + ok( $new_space->{ 'S_MAX' } == 3 ); +} + + +sub test_matches_select +{ + my ( $matches, $space ); + + $matches = [ + { Q_BEG => 9, S_BEG => 9, Q_END => 10, S_END => 10 }, + { Q_BEG => 10, S_BEG => 10, Q_END => 20, S_END => 20 }, + { Q_BEG => 20, S_BEG => 20, Q_END => 21, S_END => 21 }, + ]; + + $space = { + Q_SEQ => \"ATCG", + S_SEQ => \"atcg", + Q_MIN => 10, + S_MIN => 10, + Q_MAX => 20, + S_MAX => 20, + }; + + Maasha::AlignTwoSeq::matches_select( $matches, $space ); + + ok( scalar @{ $matches } == 1 ); +} + + +sub test_word_size_calc +{ + ok( Maasha::AlignTwoSeq::word_size_calc( { Q_MIN => 0, S_MIN => 0, Q_MAX => 1, S_MAX => 1 } ) == 1 ); + ok( Maasha::AlignTwoSeq::word_size_calc( { Q_MIN => 0, S_MIN => 0, Q_MAX => 200, S_MAX => 200 } ) == 10 + 1 ); +} + + +sub test_seq_index +{ + my ( $space, $word_size, $index ); + + $space = { + Q_SEQ => \"ATCG", + S_SEQ => \"atcg", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 3, + S_MAX => 3, + }; + + $word_size = 2; + + $index = Maasha::AlignTwoSeq::seq_index( $space, $word_size ); + + ok( scalar keys %{ $index } == 3 ); + ok( exists $index->{ 'AT' } ); + ok( exists $index->{ 'TC' } ); + ok( exists $index->{ 'CG' } ); + ok( $index->{ 'AT' }->[ 0 ] == 0 ); + ok( $index->{ 'TC' }->[ 0 ] == 1 ); + ok( $index->{ 'CG' }->[ 0 ] == 2 ); +} + + +sub test_seq_scan +{ + my ( $space, $word_size, $index, $matches ); + + $space = { + Q_SEQ => \"ATCG", + S_SEQ => \"atcg", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 3, + S_MAX => 3, + }; + + $word_size = 2; + + $index = Maasha::AlignTwoSeq::seq_index( $space, $word_size ); + $matches = Maasha::AlignTwoSeq::seq_scan( $index, $space, $word_size ); + + ok( scalar @{ $matches } == 1 ); + ok( $matches->[ 0 ]->{ 'Q_BEG' } == 0 ); + ok( $matches->[ 0 ]->{ 'S_BEG' } == 0 ); + ok( $matches->[ 0 ]->{ 'Q_END' } == 3 ); + ok( $matches->[ 0 ]->{ 'S_END' } == 3 ); + ok( $matches->[ 0 ]->{ 'LEN' } == 4 ); + ok( $matches->[ 0 ]->{ 'SCORE' } == 0 ); +} + + +sub test_matches_find +{ + my ( $space, $matches ); + + $space = { Q_SEQ => \"ATCG", S_SEQ => \"ATCG", Q_MIN => 0, S_MIN => 0, Q_MAX => 3, S_MAX => 3 }; + + $matches = Maasha::AlignTwoSeq::matches_find( $space ); + + ok( scalar @{ $matches } == 1 ); + ok( $matches->[ 0 ]->{ 'Q_BEG' } == 0 ); + ok( $matches->[ 0 ]->{ 'S_BEG' } == 0 ); + ok( $matches->[ 0 ]->{ 'Q_END' } == 3 ); + ok( $matches->[ 0 ]->{ 'S_END' } == 3 ); + ok( $matches->[ 0 ]->{ 'LEN' } == 4 ); + + $space = { Q_SEQ => \"ATXXGAT", S_SEQ => \"ATCGAT", Q_MIN => 0, S_MIN => 0, Q_MAX => 6, S_MAX => 5 }; + + $matches = Maasha::AlignTwoSeq::matches_find( $space ); + + ok( scalar @{ $matches } == 4 ); + ok( $matches->[ 0 ]->{ 'Q_BEG' } == 0 ); + ok( $matches->[ 0 ]->{ 'S_BEG' } == 0 ); + ok( $matches->[ 0 ]->{ 'Q_END' } == 1 ); + ok( $matches->[ 0 ]->{ 'S_END' } == 1 ); + ok( $matches->[ 0 ]->{ 'LEN' } == 2 ); + ok( $matches->[ 1 ]->{ 'Q_BEG' } == 5 ); + ok( $matches->[ 1 ]->{ 'S_BEG' } == 0 ); + ok( $matches->[ 1 ]->{ 'Q_END' } == 6 ); + ok( $matches->[ 1 ]->{ 'S_END' } == 1 ); + ok( $matches->[ 1 ]->{ 'LEN' } == 2 ); + ok( $matches->[ 2 ]->{ 'Q_BEG' } == 4 ); + ok( $matches->[ 2 ]->{ 'S_BEG' } == 3 ); + ok( $matches->[ 2 ]->{ 'Q_END' } == 6 ); + ok( $matches->[ 2 ]->{ 'S_END' } == 5 ); + ok( $matches->[ 2 ]->{ 'LEN' } == 3 ); + ok( $matches->[ 3 ]->{ 'Q_BEG' } == 0 ); + ok( $matches->[ 3 ]->{ 'S_BEG' } == 4 ); + ok( $matches->[ 3 ]->{ 'Q_END' } == 1 ); + ok( $matches->[ 3 ]->{ 'S_END' } == 5 ); + ok( $matches->[ 3 ]->{ 'LEN' } == 2 ); +} + +sub test_match_expand_forward_end_space +{ + my ( $match, $space ); + + $match = { + Q_BEG => 1, + Q_END => 2, + S_BEG => 1, + S_END => 2, + SCORE => 0, + LEN => 2, + }; + + $space = { + Q_SEQ => \"ATCG", + S_SEQ => \"atcg", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 3, + S_MAX => 3, + }; + + Maasha::AlignTwoSeq::match_expand_forward( $match, $space ); + + ok( $match->{ 'Q_BEG' } == 1 ); + ok( $match->{ 'S_BEG' } == 1 ); + ok( $match->{ 'Q_END' } == 3 ); + ok( $match->{ 'S_END' } == 3 ); + ok( $match->{ 'LEN' } == 3 ); +} + + +sub test_match_expand_forward_end_match +{ + my ( $match, $space ); + + $match = { + Q_BEG => 1, + Q_END => 2, + S_BEG => 1, + S_END => 2, + SCORE => 0, + LEN => 2, + }; + + $space = { + Q_SEQ => \"ATCGXX", + S_SEQ => \"atcgnn", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 6, + S_MAX => 6, + }; + + Maasha::AlignTwoSeq::match_expand_forward( $match, $space ); + + ok( $match->{ 'Q_BEG' } == 1 ); + ok( $match->{ 'S_BEG' } == 1 ); + ok( $match->{ 'Q_END' } == 3 ); + ok( $match->{ 'S_END' } == 3 ); + ok( $match->{ 'LEN' } == 3 ); +} + + +sub test_match_expand_backward_end_space +{ + my ( $match, $space ); + + $match = { + Q_BEG => 1, + Q_END => 2, + S_BEG => 1, + S_END => 2, + SCORE => 0, + LEN => 2, + }; + + $space = { + Q_SEQ => \"ATCG", + S_SEQ => \"atcg", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 3, + S_MAX => 3, + }; + + Maasha::AlignTwoSeq::match_expand_backward( $match, $space ); + + ok( $match->{ 'Q_BEG' } == 0 ); + ok( $match->{ 'S_BEG' } == 0 ); + ok( $match->{ 'Q_END' } == 2 ); + ok( $match->{ 'S_END' } == 2 ); + ok( $match->{ 'LEN' } == 3 ); +} + + +sub test_match_expand_backward_end_match +{ + my ( $match, $space ); + + $match = { + Q_BEG => 2, + Q_END => 3, + S_BEG => 2, + S_END => 3, + SCORE => 0, + LEN => 2, + }; + + $space = { + Q_SEQ => \"XATCG", + S_SEQ => \"natcg", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 4, + S_MAX => 4, + }; + + Maasha::AlignTwoSeq::match_expand_backward( $match, $space ); + + ok( $match->{ 'Q_BEG' } == 1 ); + ok( $match->{ 'S_BEG' } == 1 ); + ok( $match->{ 'Q_END' } == 3 ); + ok( $match->{ 'S_END' } == 3 ); + ok( $match->{ 'LEN' } == 3 ); +} + + +sub test_match_expand_end_space +{ + my ( $match, $space ); + + $match = { + Q_BEG => 1, + Q_END => 2, + S_BEG => 1, + S_END => 2, + SCORE => 0, + LEN => 2, + }; + + $space = { + Q_SEQ => \"ATCG", + S_SEQ => \"atcg", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 3, + S_MAX => 3, + }; + + Maasha::AlignTwoSeq::match_expand( $match, $space ); + + ok( $match->{ 'Q_BEG' } == 0 ); + ok( $match->{ 'S_BEG' } == 0 ); + ok( $match->{ 'Q_END' } == 3 ); + ok( $match->{ 'S_END' } == 3 ); + ok( $match->{ 'LEN' } == 4 ); +} + + +sub test_match_expand_end_match +{ + my ( $match, $space ); + + $match = { + Q_BEG => 2, + Q_END => 3, + S_BEG => 2, + S_END => 3, + SCORE => 0, + LEN => 2, + }; + + $space = { + Q_SEQ => \"XATCGX", + S_SEQ => \"natcgn", + Q_MIN => 0, + S_MIN => 0, + Q_MAX => 6, + S_MAX => 6, + }; + + Maasha::AlignTwoSeq::match_expand( $match, $space ); + + ok( $match->{ 'Q_BEG' } == 1 ); + ok( $match->{ 'S_BEG' } == 1 ); + ok( $match->{ 'Q_END' } == 4 ); + ok( $match->{ 'S_END' } == 4 ); + ok( $match->{ 'LEN' } == 4 ); +} + + +sub test_match_redundant_add +{ + my ( $redundant ); + + $redundant = {}; + + Maasha::AlignTwoSeq::match_redundant_add( { Q_BEG => 10, Q_END => 20, S_BEG => 110, S_END => 120 }, $redundant ); + Maasha::AlignTwoSeq::match_redundant_add( { Q_BEG => 15, Q_END => 25, S_BEG => 210, S_END => 220 }, $redundant ); + + ok( scalar keys %{ $redundant } == 16 ); +} + + +sub test_match_redundant +{ + my ( $redundant ); + + $redundant = {}; + + Maasha::AlignTwoSeq::match_redundant_add( { Q_BEG => 10, Q_END => 20, S_BEG => 110, S_END => 120 }, $redundant ); + Maasha::AlignTwoSeq::match_redundant_add( { Q_BEG => 15, Q_END => 25, S_BEG => 210, S_END => 220 }, $redundant ); + + ok( Maasha::AlignTwoSeq::match_redundant( { Q_BEG => 10, Q_END => 20, S_BEG => 110, S_END => 120 }, $redundant ) ); + ok( not Maasha::AlignTwoSeq::match_redundant( { Q_BEG => 1, Q_END => 2, S_BEG => 110, S_END => 120 }, $redundant ) ); +} + + +sub test_matches_filter +{ + ok( 0 ); +} + + +sub test_match_score_narrow +{ + ok( 0 ); +} + + +sub test_match_score_diag +{ + ok( 0 ); +} + + +sub test_match_score_len +{ + ok( 0 ); +} + + +sub test_match_score +{ + ok( 0 ); +} + + +sub test_align_two_seq +{ + my ( $space, $matches ); + + $space = { + Q_SEQ => \"ATXCG", + S_SEQ => \"ATCG", + }; + + $matches = Maasha::AlignTwoSeq::align_two_seq( $space, [] ); + + print Dumper( $matches ); + + ok( 0 ); +} + + -- 2.39.2