1 package Maasha::AlignTwoSeq;
3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to create a pair-wise alignment.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
36 use vars qw ( @ISA @EXPORT );
38 @ISA = qw( Exporter );
41 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
46 # Martin A. Hansen, August 2006.
48 # Generates an alignment by chaining matches, which are subsequences
49 # shared between two sequences. The routine functions by considering
50 # only matches within a given search space. If no matches are given
51 # these will be generated, if long matches are found these will be
52 # included in the alignment, otherwise matches will be included depending
53 # on a calculated score. New search spaces spanning the spaces between
54 # matches and the search space boundaries will be cast and recursed into.
56 # Usage: align_two_seq( { Q_SEQ => \$q_seq, S_SEQ => \$s_seq }, [] );
58 my ( $space, # search space
59 $matches, # list of matches
64 my ( @chain, $best_match, $left_space, $right_space );
68 matches_select( $matches, $space );
70 if ( scalar @{ $matches } == 0 ) # no matches - find some!
72 $matches = matches_find( $space );
74 if ( scalar @{ $matches } > 0 ) {
75 push @chain, align_two_seq( $space, $matches );
78 else # matches are included according to score
80 $matches = matches_filter( $space, $matches );
82 $best_match = shift @{ $matches };
86 push @chain, $best_match;
88 $left_space = new_space_left( $best_match, $space );
89 $right_space = new_space_right( $best_match, $space );
91 push @chain, align_two_seq( $left_space, $matches ) if defined $left_space;
92 push @chain, align_two_seq( $right_space, $matches ) if defined $right_space;
96 return wantarray ? @chain : \@chain;
102 # Martin A. Hansen, August 2009.
104 # Define a new search space if not previously
105 # defined. This occurs if align_two_seq() is
106 # called initially with only the sequences.
108 my ( $space, # search space
113 if ( not defined $space->{ 'Q_MAX' } )
115 $space->{ 'Q_MIN' } = 0;
116 $space->{ 'S_MIN' } = 0;
117 $space->{ 'Q_MAX' } = length( ${ $space->{ 'Q_SEQ' } } ) - 1,
118 $space->{ 'S_MAX' } = length( ${ $space->{ 'S_SEQ' } } ) - 1,
125 # Martin A. Hansen, August 2009.
127 # Create a new search space between the beginning of
128 # the current search space and the best match.
130 my ( $best_match, # best match
131 $space, # search space
138 if ( $best_match->{ 'Q_BEG' } - $space->{ 'Q_MIN' } >= 2 and $best_match->{ 'S_BEG' } - $space->{ 'S_MIN' } >= 2 )
141 Q_SEQ => $space->{ 'Q_SEQ' },
142 S_SEQ => $space->{ 'S_SEQ' },
143 Q_MIN => $space->{ 'Q_MIN' },
144 Q_MAX => $best_match->{ 'Q_BEG' } - 1,
145 S_MIN => $space->{ 'S_MIN' },
146 S_MAX => $best_match->{ 'S_BEG' } - 1,
150 return wantarray ? %{ $new_space } : $new_space;
156 # Martin A. Hansen, August 2009.
158 # Create a new search space between the best match
159 # and the end of the current search space.
161 my ( $best_match, # best match
162 $space, # search space
169 if ( $space->{ 'Q_MAX' } - $best_match->{ 'Q_END' } >= 2 and $space->{ 'S_MAX' } - $best_match->{ 'S_END' } >= 2 )
172 Q_SEQ => $space->{ 'Q_SEQ' },
173 S_SEQ => $space->{ 'S_SEQ' },
174 Q_MIN => $best_match->{ 'Q_END' } + 1,
175 Q_MAX => $space->{ 'Q_MAX' },
176 S_MIN => $best_match->{ 'S_END' } + 1,
177 S_MAX => $space->{ 'S_MAX' },
181 return wantarray ? %{ $new_space } : $new_space;
187 # Martin A. Hansen, August 2006.
189 # Given a list of matches and a search space,
190 # include only those matches contained within
193 my ( $matches, # list of matches
194 $space, # search space
199 @{ $matches } = grep { $_->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
200 $_->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
201 $_->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
202 $_->{ 'S_END' } <= $space->{ 'S_MAX' } } @{ $matches };
208 # Martin A. Hansen, August 2008.
210 # Given a search space calculates the wordsize for a word so a match
211 # occurs only a few times. More matches may be needed at low similarity in
212 # order to avoid starting with a wrong match.
214 my ( $space, # search space
219 my ( $factor, $word_size );
223 if ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } < $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) {
224 $word_size = int( $factor * ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } ) ) + 1;
226 $word_size = int( $factor * ( $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) ) + 1;
235 # Martin A. Hansen, August 2009.
237 # Indexes a query sequence in a specified search space
238 # Overlapping words are saved as keys in an index hash,
239 # and a list of word postions are saved as value.
241 my ( $space, # search space
242 $word_size, # word size
247 my ( $i, $word, %index );
249 for ( $i = $space->{ 'Q_MIN' }; $i <= $space->{ 'Q_MAX' } - $word_size + 1; $i++ )
251 $word = uc substr ${ $space->{ 'Q_SEQ' } }, $i, $word_size;
253 next if $word =~ tr/N//; # Skip sequences with Ns.
255 push @{ $index{ $word } }, $i;
258 return wantarray ? %index : \%index;
264 # Martin A. Hansen, August 2009.
266 my ( $index, # sequence index
267 $space, # search space
268 $word_size, # word size
273 my ( $i, $word, $pos, $match, @matches, $redundant );
277 for ( $i = $space->{ 'S_MIN' }; $i <= $space->{ 'S_MAX' } - $word_size + 1; $i++ )
279 $word = uc substr ${ $space->{ 'S_SEQ' } }, $i, $word_size;
281 if ( exists $index->{ $word } )
283 foreach $pos ( @{ $index->{ $word } } )
287 'Q_END' => $pos + $word_size - 1,
289 'S_END' => $i + $word_size - 1,
294 if ( not match_redundant( $match, $redundant ) )
296 match_expand( $match, $space );
298 push @matches, $match;
300 match_redundant_add( $match, $redundant );
306 return wantarray ? @matches : \@matches;
312 # Martin A. Hansen, November 2006
314 # Given two sequences, find all maximum expanded matches between these.
316 my ( $space, # search space
319 # returns list of matches
321 my ( $word_size, $index, $matches );
323 $word_size = word_size_calc( $space );
327 while ( scalar @{ $matches } == 0 and $word_size > 0 )
329 $index = seq_index( $space, $word_size );
330 $matches = seq_scan( $index, $space, $word_size );
335 return wantarray ? @{ $matches } : $matches;
341 # Martin A. Hansen, August 2009.
343 # Given a match and a search space hash,
344 # expand the match maximally both forward and
345 # backward direction.
347 my ( $match, # match to expand
348 $space, # search space
353 match_expand_forward( $match, $space );
354 match_expand_backward( $match, $space );
358 sub match_expand_forward
360 # Martin A. Hansen, August 2009.
362 # Given a match and a search space hash,
363 # expand the match maximally in the forward
366 my ( $match, # match to expand
367 $space, # search space
372 while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
373 $match->{ 'S_END' } <= $space->{ 'S_MAX' } and
374 uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
376 $match->{ 'Q_END' }++;
377 $match->{ 'S_END' }++;
381 $match->{ 'Q_END' }--;
382 $match->{ 'S_END' }--;
387 sub match_expand_backward
389 # Martin A. Hansen, August 2009.
391 # Given a match and a search space hash,
392 # expand the match maximally in the backward
395 my ( $match, # match to expand
396 $space, # search space
401 while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
402 $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
403 uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
405 $match->{ 'Q_BEG'}--;
406 $match->{ 'S_BEG'}--;
410 $match->{ 'Q_BEG'}++;
411 $match->{ 'S_BEG'}++;
418 # Martin A. Hansen, August 2009
420 my ( $new_match, # match
421 $redundant, # hashref
428 foreach $match ( @{ $redundant->{ $new_match->{ 'Q_BEG' } } } )
430 if ( $new_match->{ 'S_BEG' } >= $match->{ 'S_BEG' } and $new_match->{ 'S_END' } <= $match->{ 'S_END' } ) {
439 sub match_redundant_add
441 # Martin A. Hansen, August 2009
444 $redundant, # hash ref
449 map { push @{ $redundant->{ $_ } }, $match } ( $match->{ 'Q_BEG' } .. $match->{ 'Q_END' } );
455 # Martin A. Hansen, August 2009.
457 my ( $space, # search space
458 $matches, # list of matches
463 my ( $best_match, $match, @new_matches );
465 $best_match = shift @{ $matches };
467 match_score( $best_match, $space );
469 foreach $match ( @{ $matches } )
471 match_score( $match, $space );
473 if ( $match->{ 'SCORE' } > 0 )
475 if ( $match->{ 'SCORE' } > $best_match->{ 'SCORE' } ) {
476 $best_match = $match;
478 push @new_matches, $match;
483 unshift @new_matches, $best_match if $best_match->{ 'SCORE' } > 0;
485 return wantarray ? @new_matches : \@new_matches;
491 # Martin A. Hansen, August 2009
493 # Given a match and a search space scores the match according to three criteria:
495 # 1) length of match - the longer the better.
496 # 2) distance to closest corner - the shorter the better.
497 # 3) distance to closest narrow end of the search space - the shorter the better.
499 # Each of these scores are divided by search space dimensions, and the total score
500 # is calculated: total = score_len - score_corner - score_narrow.
502 # The higher the score, the better the match.
505 $space, # search space
510 my ( $score_len, $score_corner, $score_diag, $score_narrow, $score_total );
512 $score_len = match_score_len( $match );
513 $score_corner = match_score_corner( $match, $space );
514 $score_diag = match_score_diag( $match, $space );
515 $score_narrow = 0; #match_score_narrow( $match, $space );
517 $score_total = $score_len - $score_corner - $score_diag - $score_narrow;
519 # print STDERR "LEN: $score_len CORNER: $score_corner DIAG: $score_diag NARROW: $score_narrow: TOTAL: $score_total\n" if $score_total > 0;
520 print STDERR "LEN: $score_len CORNER: $score_corner DIAG: $score_diag NARROW: $score_narrow: TOTAL: $score_total\n";
522 $match->{ 'SCORE' } = $score_total;
528 # Martin A. Hansen, August 2009
530 # Score according to match length.
537 return $match->{ 'LEN' } * 2;
541 sub match_score_corner
543 # Martin A. Hansen, August 2009
545 # Score according to distance from corner.
548 $space, # search space
553 my ( $left_dist, $right_dist, $score_corner );
555 $left_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MIN' }, $space->{ 'S_MIN' }, $match->{ 'Q_BEG' } , $match->{ 'S_BEG' } );
556 $right_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MAX' }, $space->{ 'S_MAX' }, $match->{ 'Q_END' } , $match->{ 'S_END' } );
558 $score_corner = Maasha::Calc::min( $left_dist, $right_dist );
560 return int $score_corner * 1.5;
566 # Martin A. Hansen, August 2009
568 # Score according to distance from diagonal.
571 $space, # search space
576 my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag );
578 $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
579 $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
581 if ( $q_dim >= $s_dim ) # s_dim is the narrow end
583 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
587 $space->{ 'Q_MIN' } + $s_dim,
588 $space->{ 'S_MIN' } + $s_dim );
590 $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
592 $space->{ 'Q_MAX' } - $s_dim,
593 $space->{ 'S_MAX' } - $s_dim,
595 $space->{ 'S_MAX' } );
599 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
603 $space->{ 'Q_MIN' } + $q_dim,
604 $space->{ 'S_MIN' } + $q_dim );
606 $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
608 $space->{ 'Q_MAX' } - $q_dim,
609 $space->{ 'S_MAX' } - $q_dim,
611 $space->{ 'S_MAX' } );
614 $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
616 $score_diag = $min_diag_dist;
618 return int $score_diag;
622 sub match_score_narrow
624 # Martin A. Hansen, August 2009
626 # Score according to distance to the narrow end of the search space.
629 $space, # search space
634 my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow );
636 $max_narrow_dist = 0;
638 $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
639 $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
641 if ( $q_dim > $s_dim ) # s_dim is the narrow end
643 $beg_narrow_dist = $match->{ 'Q_BEG' } - $space->{ 'Q_MIN' };
644 $end_narrow_dist = $space->{ 'Q_MAX' } - $match->{ 'Q_BEG' };
646 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
648 elsif ( $q_dim < $s_dim ) # q_dim is the narrow end
650 $beg_narrow_dist = $match->{ 'S_BEG' } - $space->{ 'S_MIN' };
651 $end_narrow_dist = $space->{ 'S_MAX' } - $match->{ 'S_BEG' };
653 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
656 $score_narrow = $max_narrow_dist;
658 return abs( $beg_narrow_dist - $end_narrow_dist );
660 return $score_narrow;
666 # Martin A. Hansen, August 2009.
668 # Given a list of matches and sequence references
669 # uppercase only matching sequnce while lowercasing the rest.
671 my ( $matches, # list of matches
672 $q_seq, # query sequence ref
673 $s_seq, # subject sequence ref
680 ${ $q_seq } = lc ${ $q_seq };
681 ${ $s_seq } = lc ${ $s_seq };
683 foreach $match ( @{ $matches } )
685 substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }, uc substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' };
686 substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }, uc substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' };
694 # Martin A. Hansen, August 2009.
696 # Given a list of matches and sequence references
697 # insert indels to form aligned sequences.
699 my ( $matches, # list of matches
700 $q_seq, # query sequence ref
701 $s_seq, # subject sequence ref
704 my ( $i, $q_dist, $s_dist, $q_indels, $s_indels, $diff );
709 if ( @{ $matches } > 0 )
711 @{ $matches } = sort { $a->{ 'Q_BEG' } <=> $b->{ 'Q_BEG' } } @{ $matches }; # FIXME is this necessary?
713 # >>>>>>>>>>>>>> FIRST MATCH <<<<<<<<<<<<<<
715 $diff = $matches->[ 0 ]->{ 'Q_BEG' } - $matches->[ 0 ]->{ 'S_BEG' };
719 substr ${ $s_seq }, 0, 0, '-' x $diff;
725 substr ${ $q_seq }, 0, 0, '-' x abs $diff;
727 $q_indels += abs $diff;
730 # >>>>>>>>>>>>>> MIDDLE MATCHES <<<<<<<<<<<<<<
732 for ( $i = 0; $i < scalar @{ $matches } - 1; $i++ )
734 $q_dist = $matches->[ $i + 1 ]->{ 'Q_BEG' } - $matches->[ $i ]->{ 'Q_END' };
735 $s_dist = $matches->[ $i + 1 ]->{ 'S_BEG' } - $matches->[ $i ]->{ 'S_END' };
737 $diff = $q_dist - $s_dist;
741 substr ${ $s_seq }, $s_indels + $matches->[ $i ]->{ 'S_END' } + 1, 0, '-' x $diff;
747 substr ${ $q_seq }, $q_indels + $matches->[ $i ]->{ 'Q_END' } + 1, 0, '-' x abs $diff;
749 $q_indels += abs $diff;
754 # >>>>>>>>>>>>>> LAST MATCH <<<<<<<<<<<<<<
756 $diff = length( ${ $q_seq } ) - length( ${ $s_seq } );
760 ${ $s_seq } .= '-' x $diff;
766 ${ $q_seq } .= '-' x abs $diff;
768 $q_indels += abs $diff;
778 # Martin A. Hansen, May 2007.
780 # Calculate the local similarity of an alignment based on
781 # an alignment chain. The similarity is calculated as
782 # the number of matching residues divided by the overall
783 # length of the alignment chain. This means that a short
784 # but "good" alignment will yield a high similarity, while
785 # a long "poor" alignment will yeild a low similarity.
787 my ( $chain, # list of matches in alignment
792 my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
794 return 0 if not @{ $chain };
802 foreach $match ( @{ $chain } )
804 $match_tot += $match->[ LEN ];
806 $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
807 $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
809 $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
810 $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
813 $q_diff = $q_end - $q_beg + 1;
814 $s_diff = $s_end - $s_beg + 1;
816 $max = Maasha::Calc::max( $q_diff, $s_diff );
818 $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
826 # Martin A. Hansen, June 2007.
828 # Calculate the global similarity of an alignment based on
829 # an alignment chain. The similarity is calculated as
830 # the number of matching residues divided by the
831 # length of the shortest sequence.
833 my ( $chain, # list of matches in alignment
834 $q_seq, # ref to query sequence
835 $s_seq, # ref to subject sequence
840 my ( $match_tot, $min, $sim );
842 return 0 if not @{ $chain };
846 map { $match_tot += $_->[ LEN ] } @{ $chain };
848 $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
850 $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
856 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<