-package Align;
+package Maasha::AlignTwoSeq;
# Copyright (C) 2007 Martin A. Hansen.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-# yak yak yak
+# Routines to create a pair-wise alignment.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+use warnings;
use strict;
use Data::Dumper;
-use Storable qw( dclone );
-use IPC::Open2;
use Maasha::Calc;
use Maasha::Seq;
use vars qw ( @ISA @EXPORT );
+@ISA = qw( Exporter );
+
use constant {
- Q_BEG => 0,
- Q_END => 1,
- S_BEG => 2,
- S_END => 3,
- LEN => 4,
- SCORE => 5,
- HEAD => 0,
- SEQ => 1,
+ SCORE_FACTOR_LEN => 3,
+ SCORE_FACTOR_CORNER => 2,
+ SCORE_FACTOR_DIAG => 1,
+ SCORE_FACTOR_NARROW => 0.5,
+ VERBOSE => 0,
};
-@ISA = qw( Exporter );
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Generates an alignment by chaining matches, which are subsequences
# shared between two sequences. The routine functions by considering
# only matches within a given search space. If no matches are given
- # these will be generated, if long matches are found these will be
- # included in the alignment, otherwise matches will be included depending
- # on a calculated score. New search spaces spanning the spaces between
- # matches and the search space boundaries will be cast and recursed into.
+ # these will be located and matches will be included depending on a
+ # calculated score. New search spaces spanning the spaces between
+ # matches and the search space boundaries will be cast and recursed
+ # into.
- my ( $q_seq, # sequence 1 ref
- $s_seq, # sequence 2 ref
+ # Usage: align_two_seq( { Q_SEQ => \$q_seq, S_SEQ => \$s_seq }, [] );
+
+ my ( $space, # search space
$matches, # list of matches
- $q_min, # q sequence start position
- $q_max, # q sequecne stop position
- $s_min, # s sequence start position
- $s_max, # s sequecne stop position
- $args, # argument hash
) = @_;
- # returns a chain of matches that can be chained into an alignment
-
- $matches ||= [];
- $q_min ||= 0;
- $s_min ||= 0;
- $q_max ||= length( ${ $q_seq } ) - 1;
- $s_max ||= length( ${ $s_seq } ) - 1;
+ # Returns a list.
- $args->{ "long_matches" } ||= 50;
- $args->{ "alph_len" } ||= 4;
+ my ( @chain, $best_match, $left_space, $right_space );
- my ( $wordsize, @chain, $match, $best_match, @long_matches );
+ new_space( $space );
- $matches = &select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
+ matches_select( $matches, $space );
if ( scalar @{ $matches } == 0 ) # no matches - find some!
{
- # $wordsize = &find_wordsize( $q_min, $q_max, $s_min, $s_max, $args );
- $wordsize = 4;
- $matches = &find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
+ $matches = matches_find( $space );
- while ( scalar @{ $matches } == 0 and $wordsize > 1 )
- {
- $wordsize--;
- $matches = &find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
- }
-
if ( scalar @{ $matches } > 0 ) {
- push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args );
+ push @chain, align_two_seq( $space, $matches );
}
}
- elsif ( @long_matches = grep { $_->[ LEN ] >= $args->{ "long_matches" } } @{ $matches } ) # long matches found - include all that don't overlap!
+ else # matches are included according to score
{
- @long_matches = &order_matches( \@long_matches );
-
- foreach $match ( @long_matches )
- {
- push @chain, $match;
-
- if ( $match->[ Q_BEG ] - $q_min >= 2 and $match->[ S_BEG ] - $s_min >= 2 ) {
- push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $q_min, $match->[ Q_BEG ] - 1, $s_min, $match->[ S_BEG ] - 1, $args ); # intermediate search space
- }
+ $matches = matches_filter( $space, $matches );
- $q_min = $match->[ Q_END ] + 1;
- $s_min = $match->[ S_END ] + 1;
- }
-
- if ( $q_min + 1 < $q_max and $s_min + 1 < $s_max ) {
- push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args ); # remaining search space
- }
- }
- else # shorter matches are included according to score
- {
- foreach $match ( @{ $matches } ) {
- # $match->[ SCORE ] = &score_match( $match, $q_min, $q_max, $s_min, $s_max );
- $match->[ SCORE ] = &score_match_niels( $match, $q_seq, $q_min, $q_max, $s_min, $s_max );
- }
-
- # @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
- @{ $matches } = grep { $_->[ SCORE ] <= 0.25 } @{ $matches };
- # @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
- @{ $matches } = sort { $a->[ SCORE ] <=> $b->[ SCORE ] } @{ $matches };
-
- $best_match = shift @{ $matches };
+ $best_match = shift @{ $matches };
if ( $best_match )
{
push @chain, $best_match;
- if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
- push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1, $args ); # left search space
- }
+ $left_space = new_space_left( $best_match, $space );
+ $right_space = new_space_right( $best_match, $space );
- if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
- push @chain, &align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max, $args ); # right search space
- }
+ push @chain, align_two_seq( $left_space, $matches ) if defined $left_space;
+ push @chain, align_two_seq( $right_space, $matches ) if defined $right_space;
}
}
}
-sub select_matches
+sub new_space
{
- # Martin A. Hansen, August 2006.
+ # Martin A. Hansen, August 2009.
- # Given a list of matches and a search space,
- # include only those matches contained within
- # this search space.
+ # Define a new search space if not previously
+ # defined. This occurs if align_two_seq() is
+ # called initially with only the sequences.
- my ( $matches, # list of matches
- $q_min, # q sequence start position
- $q_max, # q sequecne stop position
- $s_min, # s sequence start position
- $s_max, # s sequecne stop position
+ my ( $space, # search space
) = @_;
- # returns list of matches
+ # Returns nothing.
- my ( @matches );
+ if ( not defined $space->{ 'Q_MAX' } )
+ {
+ $space->{ 'Q_MIN' } = 0;
+ $space->{ 'S_MIN' } = 0;
+ $space->{ 'Q_MAX' } = length( ${ $space->{ 'Q_SEQ' } } ) - 1,
+ $space->{ 'S_MAX' } = length( ${ $space->{ 'S_SEQ' } } ) - 1,
+ }
+}
- @matches = grep { $_->[ Q_BEG ] >= $q_min and
- $_->[ S_BEG ] >= $s_min and
- $_->[ Q_END ] <= $q_max and
- $_->[ S_END ] <= $s_max } @{ $matches };
- return wantarray ? @matches : \@matches;
+sub new_space_left
+{
+ # Martin A. Hansen, August 2009.
+
+ # Create a new search space between the beginning of
+ # the current search space and the best match.
+
+ my ( $best_match, # best match
+ $space, # search space
+ ) = @_;
+
+ # Returns hashref.
+
+ my ( $new_space );
+
+ if ( $best_match->{ 'Q_BEG' } - $space->{ 'Q_MIN' } >= 2 and $best_match->{ 'S_BEG' } - $space->{ 'S_MIN' } >= 2 )
+ {
+ $new_space = {
+ Q_SEQ => $space->{ 'Q_SEQ' },
+ S_SEQ => $space->{ 'S_SEQ' },
+ Q_MIN => $space->{ 'Q_MIN' },
+ Q_MAX => $best_match->{ 'Q_BEG' } - 1,
+ S_MIN => $space->{ 'S_MIN' },
+ S_MAX => $best_match->{ 'S_BEG' } - 1,
+ };
+ }
+
+ return wantarray ? %{ $new_space } : $new_space;
}
-sub order_matches
+sub new_space_right
{
- # Martin A. Hansen, October 2006
+ # Martin A. Hansen, August 2009.
- # given a list of long matches, order these by length and position
- # and include only those long matches that does not cross.
+ # Create a new search space between the best match
+ # and the end of the current search space.
- my ( $long_matches, # list of matches
+ my ( $best_match, # best match
+ $space, # search space
) = @_;
- # returns a list of matches
-
- my ( @matches, $match, $i );
+ # Returns hashref.
- @{ $long_matches } = sort { $b->[ LEN ] <=> $a->[ LEN ] } @{ $long_matches };
-
- @matches = shift @{ $long_matches };
+ my ( $new_space );
- foreach $match ( @{ $long_matches } )
+ if ( $space->{ 'Q_MAX' } - $best_match->{ 'Q_END' } >= 2 and $space->{ 'S_MAX' } - $best_match->{ 'S_END' } >= 2 )
{
- if ( $match->[ Q_END ] < $matches[ 0 ]->[ Q_BEG ] and $match->[ S_END ] < $matches[ 0 ]->[ S_BEG ] )
- {
- unshift @matches, $match;
- }
- elsif ( $match->[ Q_BEG ] > $matches[ -1 ]->[ Q_END ] and $match->[ S_BEG ] > $matches[ -1 ]->[ S_END ] )
- {
- push @matches, $match;
- }
- else
- {
- for ( $i = 1; $i < @matches; $i++ )
- {
- if ( $matches[ $i - 1 ]->[ Q_END ] < $match->[ Q_BEG ] and $match->[ Q_END ] < $matches[ $i ]->[ Q_BEG ] and
- $matches[ $i - 1 ]->[ S_END ] < $match->[ S_BEG ] and $match->[ S_END ] < $matches[ $i ]->[ S_BEG ]
- )
- {
- splice @matches, $i, 0, dclone $match;
- last;
- }
- }
- }
+ $new_space = {
+ Q_SEQ => $space->{ 'Q_SEQ' },
+ S_SEQ => $space->{ 'S_SEQ' },
+ Q_MIN => $best_match->{ 'Q_END' } + 1,
+ Q_MAX => $space->{ 'Q_MAX' },
+ S_MIN => $best_match->{ 'S_END' } + 1,
+ S_MAX => $space->{ 'S_MAX' },
+ };
}
- return wantarray ? @matches : \@matches;
+ return wantarray ? %{ $new_space } : $new_space;
}
-sub find_wordsize
+sub matches_select
{
# Martin A. Hansen, August 2006.
+ # Given a list of matches and a search space,
+ # include only those matches contained within
+ # this search space.
+
+ my ( $matches, # list of matches
+ $space, # search space
+ ) = @_;
+
+ # Returns nothing.
+
+ @{ $matches } = grep { $_->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
+ $_->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
+ $_->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
+ $_->{ 'S_END' } <= $space->{ 'S_MAX' } } @{ $matches };
+}
+
+
+sub word_size_calc
+{
+ # Martin A. Hansen, August 2008.
+
# Given a search space calculates the wordsize for a word so a match
# occurs only a few times. More matches may be needed at low similarity in
# order to avoid starting with a wrong match.
- my ( $q_min, # q sequence start position
- $q_max, # q sequecne stop position
- $s_min, # s sequence start position
- $s_max, # s sequecne stop position
- $args, # argument hash
+ my ( $space, # search space
) = @_;
- # returns integer
-
- my ( $q_dim, $s_dim, $dim_min, $wordsize );
+ # Returns integer.
- $q_dim = $q_max - $q_min + 1;
- $s_dim = $s_max - $s_min + 1;
+ my ( $factor, $word_size );
- $dim_min = &Maasha::Calc::min( $q_dim, $s_dim );
+ $factor = 0.05;
- $wordsize = 1;
-
- if ( $dim_min > 2000000 ) # optimized for DNA
- {
- $wordsize = $args->{ "long_matches" };
- }
- elsif ( $dim_min > 100000 ) # optimized for DNA
- {
- $wordsize = int( $args->{ "long_matches" } / 2 );
- }
- elsif ( $q_dim > 100 or $s_dim > 100 ) # optimized for DNA
- {
- while ( $args->{ "alph_len" } ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
- $wordsize++;
- }
- }
- else
- {
- while ( $args->{ "alph_len" } ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
- $wordsize++;
- }
+ if ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } < $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) {
+ $word_size = int( $factor * ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } ) ) + 1;
+ } else {
+ $word_size = int( $factor * ( $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) ) + 1;
}
- return $wordsize;
+ return $word_size;
}
-sub find_matches
+sub seq_index
{
- # Martin A. Hansen, November 2006
+ # Martin A. Hansen, August 2009.
- # given two sequences, find all maximum expanded matches between these
+ # Indexes a query sequence in a specified search space
+ # Overlapping words are saved as keys in an index hash,
+ # and a list of word postions are saved as value.
- my ( $q_seq, # sequence 1
- $s_seq, # sequence 2
- $wordsize, # word size
- $q_min, # q sequence start position
- $q_max, # q sequecne stop position
- $s_min, # s sequence start position
- $s_max, # s sequecne stop position
+ my ( $space, # search space
+ $word_size, # word size
) = @_;
- # returns list of matches
-
- my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches );
+ # Returns a hashref.
+
+ my ( $i, $word, %index );
- if ( length ${ $s_seq } > length ${ $q_seq } )
+ for ( $i = $space->{ 'Q_MIN' }; $i <= $space->{ 'Q_MAX' } - $word_size + 1; $i++ )
{
- for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
- {
- $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
+ $word = uc substr ${ $space->{ 'Q_SEQ' } }, $i, $word_size;
- next if $q_word =~ /n/i; # DNA/genome optimization
+ next if $word =~ tr/N//; # Skip sequences with Ns.
- push @{ $word_hash{ $q_word } }, $q_beg;
- }
+ push @{ $index{ $word } }, $i;
+ }
- for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
- {
- $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
+ return wantarray ? %index : \%index;
+}
- if ( exists $word_hash{ $s_word } )
- {
- foreach $q_beg ( @{ $word_hash{ $s_word } } )
- {
- $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
-
- if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
- $match->[ Q_END ] <= $_->[ Q_END ] and
- $match->[ S_BEG ] >= $_->[ S_BEG ] and
- $match->[ S_END ] <= $_->[ S_END ] } @matches )
- {
- next; # match is redundant
- }
- else
- {
- $match = &expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
- $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
-
- push @matches, $match;
- }
- }
- }
- }
- }
- else
- {
- for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
- {
- $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
- next if $s_word =~ /n/i; # DNA/genome optimization
+sub seq_scan
+{
+ # Martin A. Hansen, August 2009.
- push @{ $word_hash{ $s_word } }, $s_beg;
- }
+ my ( $index, # sequence index
+ $space, # search space
+ $word_size, # word size
+ ) = @_;
- for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
- {
- $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
+ # Returns a list.
+
+ my ( $i, $word, $pos, $match, @matches, $redundant );
+
+ $redundant = {};
+
+ for ( $i = $space->{ 'S_MIN' }; $i <= $space->{ 'S_MAX' } - $word_size + 1; $i++ )
+ {
+ $word = uc substr ${ $space->{ 'S_SEQ' } }, $i, $word_size;
- if ( exists $word_hash{ $q_word } )
+ if ( exists $index->{ $word } )
+ {
+ foreach $pos ( @{ $index->{ $word } } )
{
- foreach $s_beg ( @{ $word_hash{ $q_word } } )
+ $match = {
+ 'Q_BEG' => $pos,
+ 'Q_END' => $pos + $word_size - 1,
+ 'S_BEG' => $i,
+ 'S_END' => $i + $word_size - 1,
+ 'LEN' => $word_size,
+ 'SCORE' => 0,
+ };
+
+ if ( not match_redundant( $match, $redundant ) )
{
- $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
-
- if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
- $match->[ Q_END ] <= $_->[ Q_END ] and
- $match->[ S_BEG ] >= $_->[ S_BEG ] and
- $match->[ S_END ] <= $_->[ S_END ] } @matches )
- {
- next; # match is redundant
- }
- else
- {
- $match = &expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
- $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
-
- push @matches, $match;
- }
+ match_expand( $match, $space );
+
+ push @matches, $match;
+
+ match_redundant_add( $match, $redundant );
}
}
}
}
-sub expand_match
+sub matches_find
{
- # Martin A. Hansen, August 2006.
+ # Martin A. Hansen, November 2006
- # Given two sequences and a match, expand the match maximally.
- # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ]
+ # Given two sequences, find all maximum expanded matches between these.
- my ( $q_seq, # sequence 1 ref
- $s_seq, # sequence 2 ref
- $match, # sequence match
- $q_max, # q sequecne stop position
- $q_min, # q sequence start position
- $s_max, # s sequecne stop position
- $s_min, # s sequence start position
+ my ( $space, # search space
) = @_;
- # returns match
+ # returns list of matches
- my ( $q_pos, $s_pos );
+ my ( $word_size, $index, $matches );
- # expanding forward
+ $word_size = word_size_calc( $space );
- $q_pos = $match->[ Q_END ] + 1;
- $s_pos = $match->[ S_END ] + 1;
+ $matches = [];
- while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
+ while ( scalar @{ $matches } == 0 and $word_size > 0 )
{
- $q_pos++;
- $s_pos++;
+ $index = seq_index( $space, $word_size );
+ $matches = seq_scan( $index, $space, $word_size );
+
+ $word_size--;
}
- $match->[ Q_END ] = $q_pos - 1;
- $match->[ S_END ] = $s_pos - 1;
+ return wantarray ? @{ $matches } : $matches;
+}
- # expanding backwards
- $q_pos = $match->[ Q_BEG ] - 1;
- $s_pos = $match->[ S_BEG ] - 1;
+sub match_expand
+{
+ # Martin A. Hansen, August 2009.
- while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
- {
- $q_pos--;
- $s_pos--;
- }
+ # Given a match and a search space hash,
+ # expand the match maximally both forward and
+ # backward direction.
+
+ my ( $match, # match to expand
+ $space, # search space
+ ) = @_;
- $match->[ Q_BEG ] = $q_pos + 1;
- $match->[ S_BEG ] = $s_pos + 1;
+ # Returns nothing.
- return $match;
+ match_expand_forward( $match, $space );
+ match_expand_backward( $match, $space );
}
-sub score_match
+sub match_expand_forward
{
- # Martin A. Hansen, August 2006
-
- # given a match and a search space scores the match according to three criteria:
-
- # 1) length of match - the longer the better.
- # 2) distance to closest corner - the shorter the better.
- # 3) distance to closest narrow end of the search space - the shorter the better.
-
- # each of these scores are divided by search space dimentions, and the total score
- # is calculated: total = score_len - score_corner - score_narrow
-
- # the higher the score, the better the match.
-
- my ( $match, # match
- $q_min, # q sequence start position
- $q_max, # q sequecne stop position
- $s_min, # s sequence start position
- $s_max, # s sequecne stop position
- ) = @_;
-
- # returns a positive number
-
- my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist,
- $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist );
+ # Martin A. Hansen, August 2009.
- # ----- 1) scoring according to match length
+ # Given a match and a search space hash,
+ # expand the match maximally in the forward
+ # direction.
- $score_len = $match->[ LEN ] ** 3;
-
- # ---- 2) score according to distance away from diagonal
+ my ( $match, # match to expand
+ $space, # search space
+ ) = @_;
- $q_dim = $q_max - $q_min + 1;
- $s_dim = $s_max - $s_min + 1;
+ # Returns nothing.
- if ( $q_dim >= $s_dim ) # s_dim is the narrow end
+ while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
+ $match->{ 'S_END' } <= $space->{ 'S_MAX' } and
+ uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
{
- $beg_diag_dist = &Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim );
- $end_diag_dist = &Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max );
+ $match->{ 'Q_END' }++;
+ $match->{ 'S_END' }++;
+ $match->{ 'LEN' }++;
}
- else
+
+ $match->{ 'Q_END' }--;
+ $match->{ 'S_END' }--;
+ $match->{ 'LEN' }--;
+}
+
+
+sub match_expand_backward
+{
+ # Martin A. Hansen, August 2009.
+
+ # Given a match and a search space hash,
+ # expand the match maximally in the backward
+ # direction.
+
+ my ( $match, # match to expand
+ $space, # search space
+ ) = @_;
+
+ # Returns nothing.
+
+ while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
+ $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
+ uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
{
- $beg_diag_dist = &Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim );
- $end_diag_dist = &Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max );
+ $match->{ 'Q_BEG'}--;
+ $match->{ 'S_BEG'}--;
+ $match->{ 'LEN' }++;
}
- $min_diag_dist = &Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
+ $match->{ 'Q_BEG'}++;
+ $match->{ 'S_BEG'}++;
+ $match->{ 'LEN' }--;
+}
- $score_diag = 2 * $min_diag_dist ** 2;
- # ----- 3) scoring according to distance to the narrow end of the search space
+sub match_redundant
+{
+ # Martin A. Hansen, August 2009
+
+ my ( $new_match, # match
+ $redundant, # hashref
+ ) = @_;
- if ( $q_dim > $s_dim ) # s_dim is the narrow end
- {
- $beg_narrow_dist = $match->[ Q_BEG ] - $q_min;
- $end_narrow_dist = $q_max - $match->[ Q_BEG ];
+ # Returns boolean.
- $max_narrow_dist = &Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
- }
- elsif ( $q_dim < $s_dim )
- {
- $beg_narrow_dist = $match->[ S_BEG ] - $s_min;
- $end_narrow_dist = $s_max - $match->[ S_BEG ];
+ my ( $match );
- $max_narrow_dist = &Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
- }
- else
+ foreach $match ( @{ $redundant->{ $new_match->{ 'Q_BEG' } } } )
{
- $max_narrow_dist = 0;
+ if ( $new_match->{ 'S_BEG' } >= $match->{ 'S_BEG' } and $new_match->{ 'S_END' } <= $match->{ 'S_END' } ) {
+ return 1;
+ }
}
- $score_narrow = $max_narrow_dist;
+ return 0;
+}
+
+
+sub match_redundant_add
+{
+ # Martin A. Hansen, August 2009
- $score_total = $score_len - $score_narrow - $score_diag;
- # $score_total = -1 if 3 * $min_diag_dist > $match->[ LEN ];
+ my ( $match, # match
+ $redundant, # hash ref
+ ) = @_;
+
+ # Returns nothing.
- return $score_total;
+ map { push @{ $redundant->{ $_ } }, $match } ( $match->{ 'Q_BEG' } .. $match->{ 'Q_END' } );
}
-sub score_match_niels
+sub matches_filter
{
- # Niels Larsen, June 2004.
-
- # Creates a crude "heuristic" attempt of telling how likely it is that a
- # given match occurs by chance in a given search space. If sequences are
- # given their composition is taken into account. The scoring punishes
- # distance from diagonal(s) and distance from previous match(es). Scores
- # range from zero and up, and lower is better.
-
- my ( $match, # Match array
- $q_seq, # Either q_seq or s_seq
- $q_min, # Lower bound search area (query sequence)
- $q_max, # Upper bound search area (query sequence)
- $s_min, # Lower bound search area (subject sequence)
- $s_max, # Upper bound search area (subject sequence)
- ) = @_;
-
- # Returns a positive number.
-
- my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos,
- $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i,
- $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch,
- $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta,
- $delta_max );
-
- $q_beg = $match->[Q_BEG];
- $q_end = $match->[Q_END];
- $s_beg = $match->[S_BEG];
- $s_end = $match->[S_END];
-
- # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<<
+ # Martin A. Hansen, August 2009.
+
+ my ( $space, # search space
+ $matches, # list of matches
+ ) = @_;
- # Get the probability of a match from the sequence composition (when
- # match is longer than 20 and sequence is given, otherwise use 0.25)
- # and raise that to the power of the length.
+ # Returns a list.
- if ( $match->[LEN] > 20 and defined $q_seq )
- {
- $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1;
- $seqlen = length $seq;
+ my ( $best_match, $match, @new_matches );
- $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0;
+ $best_match = shift @{ $matches };
- for ( $i = 0; $i < $seqlen; $i++ )
+ match_score( $best_match, $space );
+
+ foreach $match ( @{ $matches } )
+ {
+ match_score( $match, $space );
+
+ if ( $match->{ 'SCORE' } > 0 )
{
- $chars{ substr $seq, $i, 1 }++;
+ if ( $match->{ 'SCORE' } > $best_match->{ 'SCORE' } ) {
+ $best_match = $match;
+ } else {
+ push @new_matches, $match;
+ }
}
-
- $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2
- + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2;
- }
- else {
- $pmatch = 0.25;
}
- $score = $pmatch ** ( $q_end - $q_beg + 1 );
+ unshift @new_matches, $best_match if $best_match->{ 'SCORE' } > 0;
+
+ return wantarray ? @new_matches : \@new_matches;
+}
+
-# # Punish by difference in height and width of search space,
+sub match_score
+{
+ # Martin A. Hansen, August 2009
+
+ # Given a match and a search space scores the match according to three criteria:
-# $q_dim = $q_max - $q_min + 1;
-# $s_dim = $s_max - $s_min + 1;
+ # 1) length of match - the longer the better.
+ # 2) distance to closest corner - the shorter the better.
+ # 3) distance to closest narrow end of the search space - the shorter the better.
-# if ( $q_dim != $s_dim ) {
-# $score *= abs ( $q_dim - $s_dim ) ** 2;
-# }
+ # Each of these scores are divided by search space dimensions, and the total score
+ # is calculated: total = score_len - score_corner - score_narrow.
- return $score if $score > 0.25;
+ # The higher the score, the better the match.
- # Punish by how far the match is to the closest corner of the search
- # space,
+ my ( $match, # match
+ $space, # search space
+ ) = @_;
- $q_delta_beg = $q_beg - $q_min;
- $s_delta_beg = $s_beg - $s_min;
-
- $q_delta_end = $q_max - $q_end;
- $s_delta_end = $s_max - $s_end;
-
- if ( $q_delta_beg > $s_delta_beg ) {
- $delta_beg_max = $q_delta_beg;
- } else {
- $delta_beg_max = $s_delta_beg;
- }
+ # Returns nothing.
- if ( $q_delta_end > $s_delta_end ) {
- $delta_end_max = $q_delta_end;
- } else {
- $delta_end_max = $s_delta_end;
- }
+ my ( $score_len, $score_corner, $score_diag, $score_narrow, $score_total );
- if ( $delta_beg_max <= $delta_end_max ) {
- $score *= ($delta_beg_max+1) ** 2.0;
- } else {
- $score *= ($delta_end_max+1) ** 2.0;
- }
+ $score_len = match_score_len( $match );
+ $score_corner = match_score_corner( $match, $space );
+ $score_diag = match_score_diag( $match, $space );
+ $score_narrow = match_score_narrow( $match, $space );
- return $score if $score > 0.25;
+ $score_total = $score_len - $score_corner - $score_diag - $score_narrow;
- # Add penalty if the match is towards the narrow end of the
- # search space,
+ printf STDERR "LEN: %d CORNER: %d DIAG: %d NARROW: %d TOTAL: %d\n", $score_len, $score_corner, $score_diag, $score_narrow, $score_total if VERBOSE;
- if ( ($q_max - $q_min) <= ($s_max - $s_min) )
- {
- if ( $q_delta_beg > $s_delta_beg )
- {
- $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3;
- }
- elsif ( $q_delta_end > $s_delta_end )
- {
- $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3;
- }
- }
- else
- {
- if ( $s_delta_beg > $q_delta_beg )
- {
- $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3;
- }
- elsif ( $s_delta_end > $q_delta_end )
- {
- $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3;
- }
- }
+ $match->{ 'SCORE' } = $score_total;
+}
- if ( $score < 0 ) {
- print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n";
- die qq (Score <= 0 -> $score);
- }
- return $score;
+sub match_score_len
+{
+ # Martin A. Hansen, August 2009
+
+ # Score according to match length.
+
+ my ( $match, # match
+ ) = @_;
+
+ # Returns integer.
+
+ return $match->{ 'LEN' } * SCORE_FACTOR_LEN;
}
-sub print_alignment
+sub match_score_corner
{
- # Martin A. Hansen, August 2006.
+ # Martin A. Hansen, August 2009
- # Routine to print an alignment in fasta format based
- # on a list of matches and two given sequences.
+ # Score according to distance from corner.
- my ( $matches, # list of matches
- $q_head, # query sequence head
- $q_seq, # query sequence ref
- $s_head, # subject sequence head
- $s_seq, # subject sequence ref
+ my ( $match, # match
+ $space, # search space
) = @_;
- my ( $q_pos, $s_pos, $q_nomatch, $q_match, $s_nomatch, $match, $q_aseq, $s_aseq, $i );
+ # Returns integer.
- @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
+ my ( $left_dist, $right_dist, $score_corner );
- $q_pos = 0;
- $s_pos = 0;
+ $left_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MIN' }, $space->{ 'S_MIN' }, $match->{ 'Q_BEG' } , $match->{ 'S_BEG' } );
+ $right_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MAX' }, $space->{ 'S_MAX' }, $match->{ 'Q_END' } , $match->{ 'S_END' } );
- for ( $i = 0; $i < @{ $matches }; $i++ )
- {
- $match = $matches->[ $i ];
-
- $q_nomatch = $match->[ Q_BEG ] - $q_pos;
- $s_nomatch = $match->[ S_BEG ] - $s_pos;
+ $score_corner = Maasha::Calc::min( $left_dist, $right_dist );
+
+ return $score_corner * SCORE_FACTOR_CORNER;
+}
- if ( $q_nomatch - $s_nomatch > 0 )
- {
- $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- }
- elsif ( $s_nomatch - $q_nomatch > 0 )
- {
- $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- }
- else
- {
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- }
-
- $q_pos += $q_nomatch + $match->[ LEN ];
- $s_pos += $s_nomatch + $match->[ LEN ];
- }
- $match = $matches->[ -1 ] || [ 0, 0, 0, 0, 0 ];
+sub match_score_diag
+{
+ # Martin A. Hansen, August 2009
- $q_nomatch = length( ${ $q_seq } ) - $match->[ Q_END ];
- $s_nomatch = length( ${ $s_seq } ) - $match->[ S_END ];
+ # Score according to distance from diagonal.
- if ( $q_nomatch - $s_nomatch > 0 )
- {
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
- }
- elsif ( $s_nomatch - $q_nomatch > 0 )
+ my ( $match, # match
+ $space, # search space
+ ) = @_;
+
+ # Returns integer.
+
+ my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag );
+
+ $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
+ $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
+
+ if ( $q_dim >= $s_dim ) # s_dim is the narrow end
{
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
+ $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
+ $match->{ 'S_BEG' },
+ $space->{ 'Q_MIN' },
+ $space->{ 'S_MIN' },
+ $space->{ 'Q_MIN' } + $s_dim,
+ $space->{ 'S_MIN' } + $s_dim );
+
+ $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
+ $match->{ 'S_BEG' },
+ $space->{ 'Q_MAX' } - $s_dim,
+ $space->{ 'S_MAX' } - $s_dim,
+ $space->{ 'Q_MAX' },
+ $space->{ 'S_MAX' } );
}
else
{
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
+ $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
+ $match->{ 'S_BEG' },
+ $space->{ 'Q_MIN' },
+ $space->{ 'S_MIN' },
+ $space->{ 'Q_MIN' } + $q_dim,
+ $space->{ 'S_MIN' } + $q_dim );
+
+ $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
+ $match->{ 'S_BEG' },
+ $space->{ 'Q_MAX' } - $q_dim,
+ $space->{ 'S_MAX' } - $q_dim,
+ $space->{ 'Q_MAX' },
+ $space->{ 'S_MAX' } );
}
- print ">$q_head\n$q_aseq\n>$s_head\n$s_aseq\n";
+ $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
+
+ $score_diag = $min_diag_dist;
+
+ return $score_diag * SCORE_FACTOR_DIAG;
}
-sub print_matches
+sub match_score_narrow
{
- # Martin A. Hansen, February 2007.
-
- my ( $matches, # list of matches
- $q_head, # query sequence head
- $q_seq, # query sequence ref
- $s_head, # subject sequence head
- $s_seq, # subject sequence ref
- $args, # argument hash - OPTIONAL
- ) = @_;
+ # Martin A. Hansen, August 2009
- $args->{ "wrap" } ||= 80;
+ # Score according to distance to the narrow end of the search space.
- my ( $q_pos, $s_pos, $match, $q_nomatch, $q_match, $s_nomatch, $q_aseq, $s_aseq, $pins, $i, $q, $s, $q_ruler, $s_ruler, $entries );
+ my ( $match, # match
+ $space, # search space
+ ) = @_;
- @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
+ # Returns integer.
- $q_pos = 0;
- $s_pos = 0;
+ my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow );
- for ( $i = 0; $i < @{ $matches }; $i++ )
- {
- $match = $matches->[ $i ];
+ $max_narrow_dist = 0;
- $q_nomatch = $match->[ Q_BEG ] - $q_pos;
- $s_nomatch = $match->[ S_BEG ] - $s_pos;
+ $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
+ $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
- $q = $q_pos;
- $s = $s_pos;
+ if ( $q_dim > $s_dim ) # s_dim is the narrow end
+ {
+ $beg_narrow_dist = $match->{ 'Q_BEG' } - $space->{ 'Q_MIN' };
+ $end_narrow_dist = $space->{ 'Q_MAX' } - $match->{ 'Q_BEG' };
- if ( $q_nomatch - $s_nomatch > 0 )
- {
- $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
- $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
- $pins .= " " x ( $q_nomatch - $s_nomatch );
- $q += ( $q_nomatch - $s_nomatch );
- }
- elsif ( $s_nomatch - $q_nomatch > 0 )
- {
- $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
- $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
- $pins .= " " x ( $s_nomatch - $q_nomatch );
- $s += ( $s_nomatch - $q_nomatch );
- }
+ $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
+ }
+ elsif ( $q_dim < $s_dim ) # q_dim is the narrow end
+ {
+ $beg_narrow_dist = $match->{ 'S_BEG' } - $space->{ 'S_MIN' };
+ $end_narrow_dist = $space->{ 'S_MAX' } - $match->{ 'S_BEG' };
- while ( $q < $q_pos + $q_nomatch and $s < $s_pos + $s_nomatch )
- {
- $q_aseq .= substr ${ $q_seq }, $q, 1;
- $s_aseq .= substr ${ $s_seq }, $s, 1;
+ $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
+ }
- if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) )
- {
- $pins .= ":";
- } else {
- $pins .= " ";
- }
+ $score_narrow = $max_narrow_dist;
- $q++;
- $s++;
- }
+ return $score_narrow * SCORE_FACTOR_NARROW;
+}
- $q_aseq .= substr ${ $q_seq }, $match->[ Q_BEG ], $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $match->[ S_BEG ], $match->[ LEN ];
- $pins .= "|" x $match->[ LEN ];
- $q_pos += $q_nomatch + $match->[ LEN ];
- $s_pos += $s_nomatch + $match->[ LEN ];
- }
+sub shift_case
+{
+ # Martin A. Hansen, August 2009.
- $q_nomatch = length( ${ $q_seq } ) - ( $match->[ Q_END ] || 0 );
- $s_nomatch = length( ${ $s_seq } ) - ( $match->[ S_END ] || 0 );
+ # Given a list of matches and sequence references
+ # uppercase only matching sequnce while lowercasing the rest.
- $q = $q_pos;
- $s = $s_pos;
+ my ( $matches, # list of matches
+ $q_seq, # query sequence ref
+ $s_seq, # subject sequence ref
+ ) = @_;
- while ( $q < $q_pos + $q_nomatch and $q < length ${ $q_seq } and $s < $s_pos + $s_nomatch and $s < length ${ $s_seq } )
- {
- $q_aseq .= substr ${ $q_seq }, $q, 1;
- $s_aseq .= substr ${ $s_seq }, $s, 1;
+ # Returns nothing.
- if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) ) {
- $pins .= ":";
- } else {
- $pins .= " ";
- }
+ my ( $match );
- $q++;
- $s++;
- $q_pos++;
- $s_pos++;
- }
+ ${ $q_seq } = lc ${ $q_seq };
+ ${ $s_seq } = lc ${ $s_seq };
- if ( $q_nomatch - $s_nomatch > 0 )
+ foreach $match ( @{ $matches } )
{
- $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
- $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
- $pins .= " " x ( $q_nomatch - $s_nomatch );
+ substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }, uc substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' };
+ substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }, uc substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' };
}
- elsif ( $s_nomatch - $q_nomatch > 0 )
- {
- $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
- $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
- $pins .= " " x ( $s_nomatch - $q_nomatch );
- }
-
- $q_ruler = &make_ruler( $q_aseq );
- $s_ruler = &make_ruler( $s_aseq );
-
- $entries = [
- [ "ruler", $q_ruler ],
- [ $q_head, $q_aseq ],
- [ "consensus", $pins ],
- [ $s_head, $s_aseq ],
- [ "ruler", $s_ruler ],
- ];
-
- &align_print_multi( $entries, undef, $args->{ "wrap" } )
}
-sub make_ruler
+
+sub insert_indels
{
- # Martin A. Hansen, February 2007;
+ # Martin A. Hansen, August 2009.
- # Gererates a ruler for a given sequence (with indels).
+ # Given a list of matches and sequence references
+ # insert indels to form aligned sequences.
- my ( $seq
+ my ( $matches, # list of matches
+ $q_seq, # query sequence ref
+ $s_seq, # subject sequence ref
) = @_;
- # Returns string
-
- my ( $i, $char, $skip, $count, $gap, $tics );
+ my ( $i, $q_dist, $s_dist, $q_indels, $s_indels, $diff );
- $gap = 0;
- $i = 1;
+ $q_indels = 0;
+ $s_indels = 0;
- while ( $i <= length $seq )
+ if ( @{ $matches } > 0 )
{
- $char = substr $seq, $i - 1, 1;
-
- $gap++ if $char eq "-";
+ @{ $matches } = sort { $a->{ 'Q_BEG' } <=> $b->{ 'Q_BEG' } } @{ $matches }; # FIXME is this necessary?
+
+ # >>>>>>>>>>>>>> FIRST MATCH <<<<<<<<<<<<<<
+
+ $diff = $matches->[ 0 ]->{ 'Q_BEG' } - $matches->[ 0 ]->{ 'S_BEG' };
- if ( $skip )
+ if ( $diff > 0 )
{
- $skip--;
+ substr ${ $s_seq }, 0, 0, '-' x $diff;
+
+ $s_indels += $diff;
}
- else
+ elsif ( $diff < 0 )
+ {
+ substr ${ $q_seq }, 0, 0, '-' x abs $diff;
+
+ $q_indels += abs $diff;
+ }
+
+ # >>>>>>>>>>>>>> MIDDLE MATCHES <<<<<<<<<<<<<<
+
+ for ( $i = 0; $i < scalar @{ $matches } - 1; $i++ )
{
- $count = $i - $gap;
- $count = 1 if $char eq "-";
+ $q_dist = $matches->[ $i + 1 ]->{ 'Q_BEG' } - $matches->[ $i ]->{ 'Q_END' };
+ $s_dist = $matches->[ $i + 1 ]->{ 'S_BEG' } - $matches->[ $i ]->{ 'S_END' };
+
+ $diff = $q_dist - $s_dist;
- if ( $count % 100 == 0 )
+ if ( $diff > 0 )
{
- if ( $count + length( $count ) >= length $seq )
- {
- $tics .= "|";
- }
- else
- {
- $tics .= "|" . $count;
- $skip = length $count;
- }
+ substr ${ $s_seq }, $s_indels + $matches->[ $i ]->{ 'S_END' } + 1, 0, '-' x $diff;
+
+ $s_indels += $diff;
}
- elsif ( $count % 50 == 0 ) {
- $tics .= ":";
- } elsif ( $count % 10 == 0 ) {
- $tics .= ".";
- } else {
- $tics .= " ";
+ elsif ( $diff < 0 )
+ {
+ substr ${ $q_seq }, $q_indels + $matches->[ $i ]->{ 'Q_END' } + 1, 0, '-' x abs $diff;
+
+ $q_indels += abs $diff;
}
}
+ }
+
+ # >>>>>>>>>>>>>> LAST MATCH <<<<<<<<<<<<<<
+
+ $diff = length( ${ $q_seq } ) - length( ${ $s_seq } );
+
+ if ( $diff > 0 )
+ {
+ ${ $s_seq } .= '-' x $diff;
- $i++;
+ $s_indels += $diff;
}
+ else
+ {
+ ${ $q_seq } .= '-' x abs $diff;
- return $tics;
+ $q_indels += abs $diff;
+ }
}
+1;
+
+__END__
+
sub align_sim_local
{
$q_diff = $q_end - $q_beg + 1;
$s_diff = $s_end - $s_beg + 1;
- $max = &Maasha::Calc::max( $q_diff, $s_diff );
+ $max = Maasha::Calc::max( $q_diff, $s_diff );
$sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
map { $match_tot += $_->[ LEN ] } @{ $chain };
- $min = &Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
+ $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
$sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
}
-sub align_consensus
-{
- # Martin A. Hansen, June 2006.
-
- # Given an alignment as a list of FASTA entries,
- # generates a consensus sequences based on the
- # entropies for each column similar to the way
- # a sequence logo i calculated. Returns the
- # consensus sequence as a FASTA entry.
-
- my ( $entries, # list of aligned FASTA entries
- $type, # residue type - OPTIONAL
- $min_sim, # minimum similarity - OPTIONAL
- ) = @_;
-
- # Returns tuple
-
- my ( $bit_max, $data, $pos, $char, $score, $entry );
-
- $type ||= &Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
- $min_sim ||= 50;
-
- if ( $type =~ /protein/ ) {
- $bit_max = 4;
- } else {
- $bit_max = 2;
- }
-
- $data = &Maasha::Seq::seqlogo_calc( $bit_max, $entries );
-
- foreach $pos ( @{ $data } )
- {
- ( $char, $score ) = @{ $pos->[ -1 ] };
-
- if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
- $entry->[ SEQ ] .= $char;
- } else {
- $entry->[ SEQ ] .= "-";
- }
- }
-
- $entry->[ HEAD ] = "Consensus: $min_sim%";
-
- return wantarray ? @{ $entry } : $entry;
-}
-
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<