1 package Maasha::AlignTwoSeq;
3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to create a pair-wise alignment.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
36 use vars qw ( @ISA @EXPORT );
38 @ISA = qw( Exporter );
41 SCORE_FACTOR_LEN => 3,
42 SCORE_FACTOR_CORNER => 2,
43 SCORE_FACTOR_DIAG => 1,
44 SCORE_FACTOR_NARROW => 0.5,
49 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
54 # Martin A. Hansen, August 2006.
56 # Generates an alignment by chaining matches, which are subsequences
57 # shared between two sequences. The routine functions by considering
58 # only matches within a given search space. If no matches are given
59 # these will be located and matches will be included depending on a
60 # calculated score. New search spaces spanning the spaces between
61 # matches and the search space boundaries will be cast and recursed
64 # Usage: align_two_seq( { Q_SEQ => \$q_seq, S_SEQ => \$s_seq }, [] );
66 my ( $space, # search space
67 $matches, # list of matches
72 my ( @chain, $best_match, $left_space, $right_space );
76 matches_select( $matches, $space );
78 if ( scalar @{ $matches } == 0 ) # no matches - find some!
80 $matches = matches_find( $space );
82 if ( scalar @{ $matches } > 0 ) {
83 push @chain, align_two_seq( $space, $matches );
86 else # matches are included according to score
88 $matches = matches_filter( $space, $matches );
90 $best_match = shift @{ $matches };
94 push @chain, $best_match;
96 $left_space = new_space_left( $best_match, $space );
97 $right_space = new_space_right( $best_match, $space );
99 push @chain, align_two_seq( $left_space, $matches ) if defined $left_space;
100 push @chain, align_two_seq( $right_space, $matches ) if defined $right_space;
104 return wantarray ? @chain : \@chain;
110 # Martin A. Hansen, August 2009.
112 # Define a new search space if not previously
113 # defined. This occurs if align_two_seq() is
114 # called initially with only the sequences.
116 my ( $space, # search space
121 if ( not defined $space->{ 'Q_MAX' } )
123 $space->{ 'Q_MIN' } = 0;
124 $space->{ 'S_MIN' } = 0;
125 $space->{ 'Q_MAX' } = length( ${ $space->{ 'Q_SEQ' } } ) - 1,
126 $space->{ 'S_MAX' } = length( ${ $space->{ 'S_SEQ' } } ) - 1,
133 # Martin A. Hansen, August 2009.
135 # Create a new search space between the beginning of
136 # the current search space and the best match.
138 my ( $best_match, # best match
139 $space, # search space
146 if ( $best_match->{ 'Q_BEG' } - $space->{ 'Q_MIN' } >= 2 and $best_match->{ 'S_BEG' } - $space->{ 'S_MIN' } >= 2 )
149 Q_SEQ => $space->{ 'Q_SEQ' },
150 S_SEQ => $space->{ 'S_SEQ' },
151 Q_MIN => $space->{ 'Q_MIN' },
152 Q_MAX => $best_match->{ 'Q_BEG' } - 1,
153 S_MIN => $space->{ 'S_MIN' },
154 S_MAX => $best_match->{ 'S_BEG' } - 1,
158 return wantarray ? %{ $new_space } : $new_space;
164 # Martin A. Hansen, August 2009.
166 # Create a new search space between the best match
167 # and the end of the current search space.
169 my ( $best_match, # best match
170 $space, # search space
177 if ( $space->{ 'Q_MAX' } - $best_match->{ 'Q_END' } >= 2 and $space->{ 'S_MAX' } - $best_match->{ 'S_END' } >= 2 )
180 Q_SEQ => $space->{ 'Q_SEQ' },
181 S_SEQ => $space->{ 'S_SEQ' },
182 Q_MIN => $best_match->{ 'Q_END' } + 1,
183 Q_MAX => $space->{ 'Q_MAX' },
184 S_MIN => $best_match->{ 'S_END' } + 1,
185 S_MAX => $space->{ 'S_MAX' },
189 return wantarray ? %{ $new_space } : $new_space;
195 # Martin A. Hansen, August 2006.
197 # Given a list of matches and a search space,
198 # include only those matches contained within
201 my ( $matches, # list of matches
202 $space, # search space
207 @{ $matches } = grep { $_->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
208 $_->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
209 $_->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
210 $_->{ 'S_END' } <= $space->{ 'S_MAX' } } @{ $matches };
216 # Martin A. Hansen, August 2008.
218 # Given a search space calculates the wordsize for a word so a match
219 # occurs only a few times. More matches may be needed at low similarity in
220 # order to avoid starting with a wrong match.
222 my ( $space, # search space
227 my ( $factor, $word_size );
231 if ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } < $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) {
232 $word_size = int( $factor * ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } ) ) + 1;
234 $word_size = int( $factor * ( $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) ) + 1;
243 # Martin A. Hansen, August 2009.
245 # Indexes a query sequence in a specified search space
246 # Overlapping words are saved as keys in an index hash,
247 # and a list of word postions are saved as value.
249 my ( $space, # search space
250 $word_size, # word size
255 my ( $i, $word, %index );
257 for ( $i = $space->{ 'Q_MIN' }; $i <= $space->{ 'Q_MAX' } - $word_size + 1; $i++ )
259 $word = uc substr ${ $space->{ 'Q_SEQ' } }, $i, $word_size;
261 next if $word =~ tr/N//; # Skip sequences with Ns.
263 push @{ $index{ $word } }, $i;
266 return wantarray ? %index : \%index;
272 # Martin A. Hansen, August 2009.
274 my ( $index, # sequence index
275 $space, # search space
276 $word_size, # word size
281 my ( $i, $word, $pos, $match, @matches, $redundant );
285 for ( $i = $space->{ 'S_MIN' }; $i <= $space->{ 'S_MAX' } - $word_size + 1; $i++ )
287 $word = uc substr ${ $space->{ 'S_SEQ' } }, $i, $word_size;
289 if ( exists $index->{ $word } )
291 foreach $pos ( @{ $index->{ $word } } )
295 'Q_END' => $pos + $word_size - 1,
297 'S_END' => $i + $word_size - 1,
302 if ( not match_redundant( $match, $redundant ) )
304 match_expand( $match, $space );
306 push @matches, $match;
308 match_redundant_add( $match, $redundant );
314 return wantarray ? @matches : \@matches;
320 # Martin A. Hansen, November 2006
322 # Given two sequences, find all maximum expanded matches between these.
324 my ( $space, # search space
327 # returns list of matches
329 my ( $word_size, $index, $matches );
331 $word_size = word_size_calc( $space );
335 while ( scalar @{ $matches } == 0 and $word_size > 0 )
337 $index = seq_index( $space, $word_size );
338 $matches = seq_scan( $index, $space, $word_size );
343 return wantarray ? @{ $matches } : $matches;
349 # Martin A. Hansen, August 2009.
351 # Given a match and a search space hash,
352 # expand the match maximally both forward and
353 # backward direction.
355 my ( $match, # match to expand
356 $space, # search space
361 match_expand_forward( $match, $space );
362 match_expand_backward( $match, $space );
366 sub match_expand_forward
368 # Martin A. Hansen, August 2009.
370 # Given a match and a search space hash,
371 # expand the match maximally in the forward
374 my ( $match, # match to expand
375 $space, # search space
380 while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
381 $match->{ 'S_END' } <= $space->{ 'S_MAX' } and
382 uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
384 $match->{ 'Q_END' }++;
385 $match->{ 'S_END' }++;
389 $match->{ 'Q_END' }--;
390 $match->{ 'S_END' }--;
395 sub match_expand_backward
397 # Martin A. Hansen, August 2009.
399 # Given a match and a search space hash,
400 # expand the match maximally in the backward
403 my ( $match, # match to expand
404 $space, # search space
409 while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
410 $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
411 uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
413 $match->{ 'Q_BEG'}--;
414 $match->{ 'S_BEG'}--;
418 $match->{ 'Q_BEG'}++;
419 $match->{ 'S_BEG'}++;
426 # Martin A. Hansen, August 2009
428 my ( $new_match, # match
429 $redundant, # hashref
436 foreach $match ( @{ $redundant->{ $new_match->{ 'Q_BEG' } } } )
438 if ( $new_match->{ 'S_BEG' } >= $match->{ 'S_BEG' } and $new_match->{ 'S_END' } <= $match->{ 'S_END' } ) {
447 sub match_redundant_add
449 # Martin A. Hansen, August 2009
452 $redundant, # hash ref
457 map { push @{ $redundant->{ $_ } }, $match } ( $match->{ 'Q_BEG' } .. $match->{ 'Q_END' } );
463 # Martin A. Hansen, August 2009.
465 my ( $space, # search space
466 $matches, # list of matches
471 my ( $best_match, $match, @new_matches );
473 $best_match = shift @{ $matches };
475 match_score( $best_match, $space );
477 foreach $match ( @{ $matches } )
479 match_score( $match, $space );
481 if ( $match->{ 'SCORE' } > 0 )
483 if ( $match->{ 'SCORE' } > $best_match->{ 'SCORE' } ) {
484 $best_match = $match;
486 push @new_matches, $match;
491 unshift @new_matches, $best_match if $best_match->{ 'SCORE' } > 0;
493 return wantarray ? @new_matches : \@new_matches;
499 # Martin A. Hansen, August 2009
501 # Given a match and a search space scores the match according to three criteria:
503 # 1) length of match - the longer the better.
504 # 2) distance to closest corner - the shorter the better.
505 # 3) distance to closest narrow end of the search space - the shorter the better.
507 # Each of these scores are divided by search space dimensions, and the total score
508 # is calculated: total = score_len - score_corner - score_narrow.
510 # The higher the score, the better the match.
513 $space, # search space
518 my ( $score_len, $score_corner, $score_diag, $score_narrow, $score_total );
520 $score_len = match_score_len( $match );
521 $score_corner = match_score_corner( $match, $space );
522 $score_diag = match_score_diag( $match, $space );
523 $score_narrow = match_score_narrow( $match, $space );
525 $score_total = $score_len - $score_corner - $score_diag - $score_narrow;
527 printf STDERR "LEN: %d CORNER: %d DIAG: %d NARROW: %d TOTAL: %d\n", $score_len, $score_corner, $score_diag, $score_narrow, $score_total if VERBOSE;
529 $match->{ 'SCORE' } = $score_total;
535 # Martin A. Hansen, August 2009
537 # Score according to match length.
544 return $match->{ 'LEN' } * SCORE_FACTOR_LEN;
548 sub match_score_corner
550 # Martin A. Hansen, August 2009
552 # Score according to distance from corner.
555 $space, # search space
560 my ( $left_dist, $right_dist, $score_corner );
562 $left_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MIN' }, $space->{ 'S_MIN' }, $match->{ 'Q_BEG' } , $match->{ 'S_BEG' } );
563 $right_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MAX' }, $space->{ 'S_MAX' }, $match->{ 'Q_END' } , $match->{ 'S_END' } );
565 $score_corner = Maasha::Calc::min( $left_dist, $right_dist );
567 return $score_corner * SCORE_FACTOR_CORNER;
573 # Martin A. Hansen, August 2009
575 # Score according to distance from diagonal.
578 $space, # search space
583 my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag );
585 $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
586 $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
588 if ( $q_dim >= $s_dim ) # s_dim is the narrow end
590 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
594 $space->{ 'Q_MIN' } + $s_dim,
595 $space->{ 'S_MIN' } + $s_dim );
597 $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
599 $space->{ 'Q_MAX' } - $s_dim,
600 $space->{ 'S_MAX' } - $s_dim,
602 $space->{ 'S_MAX' } );
606 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
610 $space->{ 'Q_MIN' } + $q_dim,
611 $space->{ 'S_MIN' } + $q_dim );
613 $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
615 $space->{ 'Q_MAX' } - $q_dim,
616 $space->{ 'S_MAX' } - $q_dim,
618 $space->{ 'S_MAX' } );
621 $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
623 $score_diag = $min_diag_dist;
625 return $score_diag * SCORE_FACTOR_DIAG;
629 sub match_score_narrow
631 # Martin A. Hansen, August 2009
633 # Score according to distance to the narrow end of the search space.
636 $space, # search space
641 my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow );
643 $max_narrow_dist = 0;
645 $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
646 $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
648 if ( $q_dim > $s_dim ) # s_dim is the narrow end
650 $beg_narrow_dist = $match->{ 'Q_BEG' } - $space->{ 'Q_MIN' };
651 $end_narrow_dist = $space->{ 'Q_MAX' } - $match->{ 'Q_BEG' };
653 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
655 elsif ( $q_dim < $s_dim ) # q_dim is the narrow end
657 $beg_narrow_dist = $match->{ 'S_BEG' } - $space->{ 'S_MIN' };
658 $end_narrow_dist = $space->{ 'S_MAX' } - $match->{ 'S_BEG' };
660 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
663 $score_narrow = $max_narrow_dist;
665 return $score_narrow * SCORE_FACTOR_NARROW;
671 # Martin A. Hansen, August 2009.
673 # Given a list of matches and sequence references
674 # uppercase only matching sequnce while lowercasing the rest.
676 my ( $matches, # list of matches
677 $q_seq, # query sequence ref
678 $s_seq, # subject sequence ref
685 ${ $q_seq } = lc ${ $q_seq };
686 ${ $s_seq } = lc ${ $s_seq };
688 foreach $match ( @{ $matches } )
690 substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }, uc substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' };
691 substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }, uc substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' };
699 # Martin A. Hansen, August 2009.
701 # Given a list of matches and sequence references
702 # insert indels to form aligned sequences.
704 my ( $matches, # list of matches
705 $q_seq, # query sequence ref
706 $s_seq, # subject sequence ref
709 my ( $i, $q_dist, $s_dist, $q_indels, $s_indels, $diff );
714 if ( @{ $matches } > 0 )
716 @{ $matches } = sort { $a->{ 'Q_BEG' } <=> $b->{ 'Q_BEG' } } @{ $matches }; # FIXME is this necessary?
718 # >>>>>>>>>>>>>> FIRST MATCH <<<<<<<<<<<<<<
720 $diff = $matches->[ 0 ]->{ 'Q_BEG' } - $matches->[ 0 ]->{ 'S_BEG' };
724 substr ${ $s_seq }, 0, 0, '-' x $diff;
730 substr ${ $q_seq }, 0, 0, '-' x abs $diff;
732 $q_indels += abs $diff;
735 # >>>>>>>>>>>>>> MIDDLE MATCHES <<<<<<<<<<<<<<
737 for ( $i = 0; $i < scalar @{ $matches } - 1; $i++ )
739 $q_dist = $matches->[ $i + 1 ]->{ 'Q_BEG' } - $matches->[ $i ]->{ 'Q_END' };
740 $s_dist = $matches->[ $i + 1 ]->{ 'S_BEG' } - $matches->[ $i ]->{ 'S_END' };
742 $diff = $q_dist - $s_dist;
746 substr ${ $s_seq }, $s_indels + $matches->[ $i ]->{ 'S_END' } + 1, 0, '-' x $diff;
752 substr ${ $q_seq }, $q_indels + $matches->[ $i ]->{ 'Q_END' } + 1, 0, '-' x abs $diff;
754 $q_indels += abs $diff;
759 # >>>>>>>>>>>>>> LAST MATCH <<<<<<<<<<<<<<
761 $diff = length( ${ $q_seq } ) - length( ${ $s_seq } );
765 ${ $s_seq } .= '-' x $diff;
771 ${ $q_seq } .= '-' x abs $diff;
773 $q_indels += abs $diff;
784 # Martin A. Hansen, May 2007.
786 # Calculate the local similarity of an alignment based on
787 # an alignment chain. The similarity is calculated as
788 # the number of matching residues divided by the overall
789 # length of the alignment chain. This means that a short
790 # but "good" alignment will yield a high similarity, while
791 # a long "poor" alignment will yeild a low similarity.
793 my ( $chain, # list of matches in alignment
798 my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
800 return 0 if not @{ $chain };
808 foreach $match ( @{ $chain } )
810 $match_tot += $match->[ LEN ];
812 $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
813 $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
815 $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
816 $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
819 $q_diff = $q_end - $q_beg + 1;
820 $s_diff = $s_end - $s_beg + 1;
822 $max = Maasha::Calc::max( $q_diff, $s_diff );
824 $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
832 # Martin A. Hansen, June 2007.
834 # Calculate the global similarity of an alignment based on
835 # an alignment chain. The similarity is calculated as
836 # the number of matching residues divided by the
837 # length of the shortest sequence.
839 my ( $chain, # list of matches in alignment
840 $q_seq, # ref to query sequence
841 $s_seq, # ref to subject sequence
846 my ( $match_tot, $min, $sim );
848 return 0 if not @{ $chain };
852 map { $match_tot += $_->[ LEN ] } @{ $chain };
854 $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
856 $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
862 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<