-package Align;
+package Maasha::AlignTwoSeq;
# Copyright (C) 2007 Martin A. Hansen.
use strict;
use Data::Dumper;
use Storable qw( dclone );
-use IPC::Open2;
use Maasha::Calc;
use Maasha::Seq;
use vars qw ( @ISA @EXPORT );
use constant {
- Q_BEG => 0,
- Q_END => 1,
- S_BEG => 2,
- S_END => 3,
- LEN => 4,
- SCORE => 5,
- HEAD => 0,
- SEQ => 1,
+ Q_BEG => 0,
+ Q_END => 1,
+ S_BEG => 2,
+ S_END => 3,
+ LEN => 4,
+ SCORE => 5,
+ HEAD => 0,
+ SEQ => 1,
+ ALPH_LEN => 4,
};
@ISA = qw( Exporter );
$q_max, # q sequecne stop position
$s_min, # s sequence start position
$s_max, # s sequecne stop position
- $args, # argument hash
) = @_;
# returns a chain of matches that can be chained into an alignment
$q_max ||= length( ${ $q_seq } ) - 1;
$s_max ||= length( ${ $s_seq } ) - 1;
- $args->{ "long_matches" } ||= 50;
- $args->{ "alph_len" } ||= 4;
-
my ( $wordsize, @chain, $match, $best_match, @long_matches );
$matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
if ( scalar @{ $matches } == 0 ) # no matches - find some!
{
- # $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max, $args );
- $wordsize = 4;
+ $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max );
$matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
while ( scalar @{ $matches } == 0 and $wordsize > 1 )
}
if ( scalar @{ $matches } > 0 ) {
- push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args );
+ push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max );
}
}
- elsif ( @long_matches = grep { $_->[ LEN ] >= $args->{ "long_matches" } } @{ $matches } ) # long matches found - include all that don't overlap!
- {
- @long_matches = order_matches( \@long_matches );
-
- foreach $match ( @long_matches )
- {
- push @chain, $match;
-
- if ( $match->[ Q_BEG ] - $q_min >= 2 and $match->[ S_BEG ] - $s_min >= 2 ) {
- push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $match->[ Q_BEG ] - 1, $s_min, $match->[ S_BEG ] - 1, $args ); # intermediate search space
- }
-
- $q_min = $match->[ Q_END ] + 1;
- $s_min = $match->[ S_END ] + 1;
- }
-
- if ( $q_min + 1 < $q_max and $s_min + 1 < $s_max ) {
- push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args ); # remaining search space
- }
- }
- else # shorter matches are included according to score
+ else # matches are included according to score
{
foreach $match ( @{ $matches } ) {
- # $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
- $match->[ SCORE ] = score_match_niels( $match, $q_seq, $q_min, $q_max, $s_min, $s_max );
+ $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
}
- # @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
- @{ $matches } = grep { $_->[ SCORE ] <= 0.25 } @{ $matches };
- # @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
- @{ $matches } = sort { $a->[ SCORE ] <=> $b->[ SCORE ] } @{ $matches };
+ @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
+ @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
$best_match = shift @{ $matches };
push @chain, $best_match;
if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
- push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1, $args ); # left search space
+ push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1 ); # left search space
}
if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
- push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max, $args ); # right search space
+ push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max ); # right search space
}
}
}
}
-sub order_matches
-{
- # Martin A. Hansen, October 2006
-
- # given a list of long matches, order these by length and position
- # and include only those long matches that does not cross.
-
- my ( $long_matches, # list of matches
- ) = @_;
-
- # returns a list of matches
-
- my ( @matches, $match, $i );
-
- @{ $long_matches } = sort { $b->[ LEN ] <=> $a->[ LEN ] } @{ $long_matches };
-
- @matches = shift @{ $long_matches };
-
- foreach $match ( @{ $long_matches } )
- {
- if ( $match->[ Q_END ] < $matches[ 0 ]->[ Q_BEG ] and $match->[ S_END ] < $matches[ 0 ]->[ S_BEG ] )
- {
- unshift @matches, $match;
- }
- elsif ( $match->[ Q_BEG ] > $matches[ -1 ]->[ Q_END ] and $match->[ S_BEG ] > $matches[ -1 ]->[ S_END ] )
- {
- push @matches, $match;
- }
- else
- {
- for ( $i = 1; $i < @matches; $i++ )
- {
- if ( $matches[ $i - 1 ]->[ Q_END ] < $match->[ Q_BEG ] and $match->[ Q_END ] < $matches[ $i ]->[ Q_BEG ] and
- $matches[ $i - 1 ]->[ S_END ] < $match->[ S_BEG ] and $match->[ S_END ] < $matches[ $i ]->[ S_BEG ]
- )
- {
- splice @matches, $i, 0, dclone $match;
- last;
- }
- }
- }
- }
-
- return wantarray ? @matches : \@matches;
-}
-
-
sub find_wordsize
{
# Martin A. Hansen, August 2006.
$q_max, # q sequecne stop position
$s_min, # s sequence start position
$s_max, # s sequecne stop position
- $args, # argument hash
) = @_;
# returns integer
$wordsize = 1;
- if ( $dim_min > 2000000 ) # optimized for DNA
+ if ( $dim_min > 2000000 ) # optimized for DNA
{
- $wordsize = $args->{ "long_matches" };
+ $wordsize = 50;
}
- elsif ( $dim_min > 100000 ) # optimized for DNA
+ elsif ( $dim_min > 100000 ) # optimized for DNA
{
- $wordsize = int( $args->{ "long_matches" } / 2 );
+ $wordsize = 25;
}
elsif ( $q_dim > 100 or $s_dim > 100 ) # optimized for DNA
{
- while ( $args->{ "alph_len" } ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
+ while ( ALPH_LEN ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
$wordsize++;
}
}
else
{
- while ( $args->{ "alph_len" } ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
+ while ( ALPH_LEN ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
$wordsize++;
}
}
$score_narrow = $max_narrow_dist;
$score_total = $score_len - $score_narrow - $score_diag;
- # $score_total = -1 if 3 * $min_diag_dist > $match->[ LEN ];
return $score_total;
}
}
-sub print_alignment
+sub insert_indels
{
- # Martin A. Hansen, August 2006.
+ # Martin A. Hansen, June 2009.
- # Routine to print an alignment in fasta format based
- # on a list of matches and two given sequences.
+ # Given a list of matches and references to q_seq and s_seq,
+ # insert indels to form aligned sequences.
- my ( $matches, # list of matches
- $q_head, # query sequence head
- $q_seq, # query sequence ref
- $s_head, # subject sequence head
- $s_seq, # subject sequence ref
+ my ( $matches, # list of matches
+ $q_seq, # ref to query sequence
+ $s_seq, # ref to subject sequence
) = @_;
- my ( $q_pos, $s_pos, $q_nomatch, $q_match, $s_nomatch, $match, $q_aseq, $s_aseq, $i );
+ # Returns nothing.
+
+ my ( $q_indels, $s_indels, $match, $diff, $first );
@{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
- $q_pos = 0;
- $s_pos = 0;
+ $first = 1;
+ $q_indels = 0;
+ $s_indels = 0;
- for ( $i = 0; $i < @{ $matches }; $i++ )
- {
- $match = $matches->[ $i ];
-
- $q_nomatch = $match->[ Q_BEG ] - $q_pos;
- $s_nomatch = $match->[ S_BEG ] - $s_pos;
+ # ---- initial indels ----
- if ( $q_nomatch - $s_nomatch > 0 )
- {
- $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- }
- elsif ( $s_nomatch - $q_nomatch > 0 )
- {
- $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- }
- else
- {
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- }
-
- $q_pos += $q_nomatch + $match->[ LEN ];
- $s_pos += $s_nomatch + $match->[ LEN ];
- }
+ $match = shift @{ $matches };
- $match = $matches->[ -1 ] || [ 0, 0, 0, 0, 0 ];
+ substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
+ substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
- $q_nomatch = length( ${ $q_seq } ) - $match->[ Q_END ];
- $s_nomatch = length( ${ $s_seq } ) - $match->[ S_END ];
+ $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
- if ( $q_nomatch - $s_nomatch > 0 )
+ if ( $diff < 0 )
{
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
- }
- elsif ( $s_nomatch - $q_nomatch > 0 )
- {
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
+ substr ${ $q_seq }, 0, 0, '-' x abs $diff;
+
+ $q_indels += abs $diff;
}
- else
+ elsif ( $diff > 0 )
{
- $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
- }
-
- print ">$q_head\n$q_aseq\n>$s_head\n$s_aseq\n";
-}
-
-
-sub print_matches
-{
- # Martin A. Hansen, February 2007.
-
- my ( $matches, # list of matches
- $q_head, # query sequence head
- $q_seq, # query sequence ref
- $s_head, # subject sequence head
- $s_seq, # subject sequence ref
- $args, # argument hash - OPTIONAL
- ) = @_;
+ substr ${ $s_seq }, 0, 0, '-' x abs $diff;
- $args->{ "wrap" } ||= 80;
-
- my ( $q_pos, $s_pos, $match, $q_nomatch, $q_match, $s_nomatch, $q_aseq, $s_aseq, $pins, $i, $q, $s, $q_ruler, $s_ruler, $entries );
-
- @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
+ $s_indels += abs $diff;
+ }
- $q_pos = 0;
- $s_pos = 0;
+ # ---- internal indels ----
- for ( $i = 0; $i < @{ $matches }; $i++ )
+ foreach $match ( @{ $matches } )
{
- $match = $matches->[ $i ];
+ substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
+ substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
- $q_nomatch = $match->[ Q_BEG ] - $q_pos;
- $s_nomatch = $match->[ S_BEG ] - $s_pos;
+ $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
- $q = $q_pos;
- $s = $s_pos;
-
- if ( $q_nomatch - $s_nomatch > 0 )
- {
- $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
- $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
- $pins .= " " x ( $q_nomatch - $s_nomatch );
- $q += ( $q_nomatch - $s_nomatch );
- }
- elsif ( $s_nomatch - $q_nomatch > 0 )
+ if ( $diff < 0 )
{
- $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
- $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
- $pins .= " " x ( $s_nomatch - $q_nomatch );
- $s += ( $s_nomatch - $q_nomatch );
- }
+ substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff;
- while ( $q < $q_pos + $q_nomatch and $s < $s_pos + $s_nomatch )
- {
- $q_aseq .= substr ${ $q_seq }, $q, 1;
- $s_aseq .= substr ${ $s_seq }, $s, 1;
-
- if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) )
- {
- $pins .= ":";
- } else {
- $pins .= " ";
- }
-
- $q++;
- $s++;
+ $q_indels += abs $diff;
}
+ elsif ( $diff > 0 )
+ {
+ substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff;
- $q_aseq .= substr ${ $q_seq }, $match->[ Q_BEG ], $match->[ LEN ];
- $s_aseq .= substr ${ $s_seq }, $match->[ S_BEG ], $match->[ LEN ];
- $pins .= "|" x $match->[ LEN ];
-
- $q_pos += $q_nomatch + $match->[ LEN ];
- $s_pos += $s_nomatch + $match->[ LEN ];
- }
-
- $q_nomatch = length( ${ $q_seq } ) - ( $match->[ Q_END ] || 0 );
- $s_nomatch = length( ${ $s_seq } ) - ( $match->[ S_END ] || 0 );
-
- $q = $q_pos;
- $s = $s_pos;
-
- while ( $q < $q_pos + $q_nomatch and $q < length ${ $q_seq } and $s < $s_pos + $s_nomatch and $s < length ${ $s_seq } )
- {
- $q_aseq .= substr ${ $q_seq }, $q, 1;
- $s_aseq .= substr ${ $s_seq }, $s, 1;
-
- if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) ) {
- $pins .= ":";
- } else {
- $pins .= " ";
+ $s_indels += abs $diff;
}
-
- $q++;
- $s++;
- $q_pos++;
- $s_pos++;
- }
-
- if ( $q_nomatch - $s_nomatch > 0 )
- {
- $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
- $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
- $pins .= " " x ( $q_nomatch - $s_nomatch );
- }
- elsif ( $s_nomatch - $q_nomatch > 0 )
- {
- $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
- $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
- $pins .= " " x ( $s_nomatch - $q_nomatch );
}
- $q_ruler = make_ruler( $q_aseq );
- $s_ruler = make_ruler( $s_aseq );
+ # ---- last indels ----
- $entries = [
- [ "ruler", $q_ruler ],
- [ $q_head, $q_aseq ],
- [ "consensus", $pins ],
- [ $s_head, $s_aseq ],
- [ "ruler", $s_ruler ],
- ];
+ $diff = length( ${ $s_seq } ) - length( ${ $q_seq } );
- align_print_multi( $entries, undef, $args->{ "wrap" } )
-}
-
-
-sub make_ruler
-{
- # Martin A. Hansen, February 2007;
-
- # Gererates a ruler for a given sequence (with indels).
-
- my ( $seq
- ) = @_;
-
- # Returns string
-
- my ( $i, $char, $skip, $count, $gap, $tics );
-
- $gap = 0;
- $i = 1;
-
- while ( $i <= length $seq )
- {
- $char = substr $seq, $i - 1, 1;
-
- $gap++ if $char eq "-";
-
- if ( $skip )
- {
- $skip--;
- }
- else
- {
- $count = $i - $gap;
- $count = 1 if $char eq "-";
-
- if ( $count % 100 == 0 )
- {
- if ( $count + length( $count ) >= length $seq )
- {
- $tics .= "|";
- }
- else
- {
- $tics .= "|" . $count;
- $skip = length $count;
- }
- }
- elsif ( $count % 50 == 0 ) {
- $tics .= ":";
- } elsif ( $count % 10 == 0 ) {
- $tics .= ".";
- } else {
- $tics .= " ";
- }
- }
-
- $i++;
+ if ( $diff > 0 ) {
+ ${ $q_seq } .= '-' x abs $diff;
+ } else {
+ ${ $s_seq } .= '-' x abs $diff;
}
-
- return $tics;
}