1 package Maasha::AlignTwoSeq;
3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 use Storable qw( dclone );
36 use vars qw ( @ISA @EXPORT );
50 @ISA = qw( Exporter );
53 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
58 # Martin A. Hansen, August 2006.
60 # Generates an alignment by chaining matches, which are subsequences
61 # shared between two sequences. The routine functions by considering
62 # only matches within a given search space. If no matches are given
63 # these will be generated, if long matches are found these will be
64 # included in the alignment, otherwise matches will be included depending
65 # on a calculated score. New search spaces spanning the spaces between
66 # matches and the search space boundaries will be cast and recursed into.
68 my ( $q_seq, # sequence 1 ref
69 $s_seq, # sequence 2 ref
70 $matches, # list of matches
71 $q_min, # q sequence start position
72 $q_max, # q sequecne stop position
73 $s_min, # s sequence start position
74 $s_max, # s sequecne stop position
77 # returns a chain of matches that can be chained into an alignment
82 $q_max ||= length( ${ $q_seq } ) - 1;
83 $s_max ||= length( ${ $s_seq } ) - 1;
85 my ( $wordsize, @chain, $match, $best_match, @long_matches );
87 $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
89 if ( scalar @{ $matches } == 0 ) # no matches - find some!
91 $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max );
92 $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
94 while ( scalar @{ $matches } == 0 and $wordsize > 1 )
97 $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
100 if ( scalar @{ $matches } > 0 ) {
101 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max );
104 else # matches are included according to score
106 foreach $match ( @{ $matches } ) {
107 $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
110 @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
111 @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
113 $best_match = shift @{ $matches };
117 push @chain, $best_match;
119 if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
120 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1 ); # left search space
123 if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
124 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max ); # right search space
129 return wantarray ? @chain : \@chain;
135 # Martin A. Hansen, August 2006.
137 # Given a list of matches and a search space,
138 # include only those matches contained within
141 my ( $matches, # list of matches
142 $q_min, # q sequence start position
143 $q_max, # q sequecne stop position
144 $s_min, # s sequence start position
145 $s_max, # s sequecne stop position
148 # returns list of matches
152 @matches = grep { $_->[ Q_BEG ] >= $q_min and
153 $_->[ S_BEG ] >= $s_min and
154 $_->[ Q_END ] <= $q_max and
155 $_->[ S_END ] <= $s_max } @{ $matches };
157 return wantarray ? @matches : \@matches;
163 # Martin A. Hansen, August 2006.
165 # Given a search space calculates the wordsize for a word so a match
166 # occurs only a few times. More matches may be needed at low similarity in
167 # order to avoid starting with a wrong match.
169 my ( $q_min, # q sequence start position
170 $q_max, # q sequecne stop position
171 $s_min, # s sequence start position
172 $s_max, # s sequecne stop position
177 my ( $q_dim, $s_dim, $dim_min, $wordsize );
179 $q_dim = $q_max - $q_min + 1;
180 $s_dim = $s_max - $s_min + 1;
182 $dim_min = Maasha::Calc::min( $q_dim, $s_dim );
186 if ( $dim_min > 2000000 ) # optimized for DNA
190 elsif ( $dim_min > 100000 ) # optimized for DNA
194 elsif ( $q_dim > 100 or $s_dim > 100 ) # optimized for DNA
196 while ( ALPH_LEN ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
202 while ( ALPH_LEN ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
213 # Martin A. Hansen, November 2006
215 # given two sequences, find all maximum expanded matches between these
217 my ( $q_seq, # sequence 1
219 $wordsize, # word size
220 $q_min, # q sequence start position
221 $q_max, # q sequecne stop position
222 $s_min, # s sequence start position
223 $s_max, # s sequecne stop position
226 # returns list of matches
228 my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches );
230 if ( length ${ $s_seq } > length ${ $q_seq } )
232 for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
234 $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
236 next if $q_word =~ /n/i; # DNA/genome optimization
238 push @{ $word_hash{ $q_word } }, $q_beg;
241 for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
243 $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
245 if ( exists $word_hash{ $s_word } )
247 foreach $q_beg ( @{ $word_hash{ $s_word } } )
249 $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
251 if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
252 $match->[ Q_END ] <= $_->[ Q_END ] and
253 $match->[ S_BEG ] >= $_->[ S_BEG ] and
254 $match->[ S_END ] <= $_->[ S_END ] } @matches )
256 next; # match is redundant
260 $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
261 $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
263 push @matches, $match;
271 for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
273 $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
275 next if $s_word =~ /n/i; # DNA/genome optimization
277 push @{ $word_hash{ $s_word } }, $s_beg;
280 for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
282 $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
284 if ( exists $word_hash{ $q_word } )
286 foreach $s_beg ( @{ $word_hash{ $q_word } } )
288 $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
290 if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
291 $match->[ Q_END ] <= $_->[ Q_END ] and
292 $match->[ S_BEG ] >= $_->[ S_BEG ] and
293 $match->[ S_END ] <= $_->[ S_END ] } @matches )
295 next; # match is redundant
299 $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
300 $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
302 push @matches, $match;
309 return wantarray ? @matches : \@matches;
315 # Martin A. Hansen, August 2006.
317 # Given two sequences and a match, expand the match maximally.
318 # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ]
320 my ( $q_seq, # sequence 1 ref
321 $s_seq, # sequence 2 ref
322 $match, # sequence match
323 $q_max, # q sequecne stop position
324 $q_min, # q sequence start position
325 $s_max, # s sequecne stop position
326 $s_min, # s sequence start position
331 my ( $q_pos, $s_pos );
335 $q_pos = $match->[ Q_END ] + 1;
336 $s_pos = $match->[ S_END ] + 1;
338 while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
344 $match->[ Q_END ] = $q_pos - 1;
345 $match->[ S_END ] = $s_pos - 1;
347 # expanding backwards
349 $q_pos = $match->[ Q_BEG ] - 1;
350 $s_pos = $match->[ S_BEG ] - 1;
352 while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
358 $match->[ Q_BEG ] = $q_pos + 1;
359 $match->[ S_BEG ] = $s_pos + 1;
367 # Martin A. Hansen, August 2006
369 # given a match and a search space scores the match according to three criteria:
371 # 1) length of match - the longer the better.
372 # 2) distance to closest corner - the shorter the better.
373 # 3) distance to closest narrow end of the search space - the shorter the better.
375 # each of these scores are divided by search space dimentions, and the total score
376 # is calculated: total = score_len - score_corner - score_narrow
378 # the higher the score, the better the match.
381 $q_min, # q sequence start position
382 $q_max, # q sequecne stop position
383 $s_min, # s sequence start position
384 $s_max, # s sequecne stop position
387 # returns a positive number
389 my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist,
390 $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist );
392 # ----- 1) scoring according to match length
394 $score_len = $match->[ LEN ] ** 3;
396 # ---- 2) score according to distance away from diagonal
398 $q_dim = $q_max - $q_min + 1;
399 $s_dim = $s_max - $s_min + 1;
401 if ( $q_dim >= $s_dim ) # s_dim is the narrow end
403 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim );
404 $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max );
408 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim );
409 $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max );
412 $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
414 $score_diag = 2 * $min_diag_dist ** 2;
416 # ----- 3) scoring according to distance to the narrow end of the search space
418 if ( $q_dim > $s_dim ) # s_dim is the narrow end
420 $beg_narrow_dist = $match->[ Q_BEG ] - $q_min;
421 $end_narrow_dist = $q_max - $match->[ Q_BEG ];
423 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
425 elsif ( $q_dim < $s_dim )
427 $beg_narrow_dist = $match->[ S_BEG ] - $s_min;
428 $end_narrow_dist = $s_max - $match->[ S_BEG ];
430 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
434 $max_narrow_dist = 0;
437 $score_narrow = $max_narrow_dist;
439 $score_total = $score_len - $score_narrow - $score_diag;
445 sub score_match_niels
447 # Niels Larsen, June 2004.
449 # Creates a crude "heuristic" attempt of telling how likely it is that a
450 # given match occurs by chance in a given search space. If sequences are
451 # given their composition is taken into account. The scoring punishes
452 # distance from diagonal(s) and distance from previous match(es). Scores
453 # range from zero and up, and lower is better.
455 my ( $match, # Match array
456 $q_seq, # Either q_seq or s_seq
457 $q_min, # Lower bound search area (query sequence)
458 $q_max, # Upper bound search area (query sequence)
459 $s_min, # Lower bound search area (subject sequence)
460 $s_max, # Upper bound search area (subject sequence)
463 # Returns a positive number.
465 my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos,
466 $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i,
467 $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch,
468 $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta,
471 $q_beg = $match->[Q_BEG];
472 $q_end = $match->[Q_END];
473 $s_beg = $match->[S_BEG];
474 $s_end = $match->[S_END];
476 # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<<
478 # Get the probability of a match from the sequence composition (when
479 # match is longer than 20 and sequence is given, otherwise use 0.25)
480 # and raise that to the power of the length.
482 if ( $match->[LEN] > 20 and defined $q_seq )
484 $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1;
485 $seqlen = length $seq;
487 $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0;
489 for ( $i = 0; $i < $seqlen; $i++ )
491 $chars{ substr $seq, $i, 1 }++;
494 $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2
495 + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2;
501 $score = $pmatch ** ( $q_end - $q_beg + 1 );
503 # # Punish by difference in height and width of search space,
505 # $q_dim = $q_max - $q_min + 1;
506 # $s_dim = $s_max - $s_min + 1;
508 # if ( $q_dim != $s_dim ) {
509 # $score *= abs ( $q_dim - $s_dim ) ** 2;
512 return $score if $score > 0.25;
514 # Punish by how far the match is to the closest corner of the search
517 $q_delta_beg = $q_beg - $q_min;
518 $s_delta_beg = $s_beg - $s_min;
520 $q_delta_end = $q_max - $q_end;
521 $s_delta_end = $s_max - $s_end;
523 if ( $q_delta_beg > $s_delta_beg ) {
524 $delta_beg_max = $q_delta_beg;
526 $delta_beg_max = $s_delta_beg;
529 if ( $q_delta_end > $s_delta_end ) {
530 $delta_end_max = $q_delta_end;
532 $delta_end_max = $s_delta_end;
535 if ( $delta_beg_max <= $delta_end_max ) {
536 $score *= ($delta_beg_max+1) ** 2.0;
538 $score *= ($delta_end_max+1) ** 2.0;
541 return $score if $score > 0.25;
543 # Add penalty if the match is towards the narrow end of the
546 if ( ($q_max - $q_min) <= ($s_max - $s_min) )
548 if ( $q_delta_beg > $s_delta_beg )
550 $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3;
552 elsif ( $q_delta_end > $s_delta_end )
554 $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3;
559 if ( $s_delta_beg > $q_delta_beg )
561 $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3;
563 elsif ( $s_delta_end > $q_delta_end )
565 $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3;
570 print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n";
571 die qq (Score <= 0 -> $score);
580 # Martin A. Hansen, June 2009.
582 # Given a list of matches and references to q_seq and s_seq,
583 # insert indels to form aligned sequences.
585 my ( $matches, # list of matches
586 $q_seq, # ref to query sequence
587 $s_seq, # ref to subject sequence
592 my ( $q_indels, $s_indels, $match, $diff, $first );
594 @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
600 # ---- initial indels ----
602 $match = shift @{ $matches };
604 substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
605 substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
607 $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
611 substr ${ $q_seq }, 0, 0, '-' x abs $diff;
613 $q_indels += abs $diff;
617 substr ${ $s_seq }, 0, 0, '-' x abs $diff;
619 $s_indels += abs $diff;
622 # ---- internal indels ----
624 foreach $match ( @{ $matches } )
626 substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
627 substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
629 $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
633 substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff;
635 $q_indels += abs $diff;
639 substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff;
641 $s_indels += abs $diff;
645 # ---- last indels ----
647 $diff = length( ${ $s_seq } ) - length( ${ $q_seq } );
650 ${ $q_seq } .= '-' x abs $diff;
652 ${ $s_seq } .= '-' x abs $diff;
659 # Martin A. Hansen, May 2007.
661 # Calculate the local similarity of an alignment based on
662 # an alignment chain. The similarity is calculated as
663 # the number of matching residues divided by the overall
664 # length of the alignment chain. This means that a short
665 # but "good" alignment will yield a high similarity, while
666 # a long "poor" alignment will yeild a low similarity.
668 my ( $chain, # list of matches in alignment
673 my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
675 return 0 if not @{ $chain };
683 foreach $match ( @{ $chain } )
685 $match_tot += $match->[ LEN ];
687 $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
688 $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
690 $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
691 $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
694 $q_diff = $q_end - $q_beg + 1;
695 $s_diff = $s_end - $s_beg + 1;
697 $max = Maasha::Calc::max( $q_diff, $s_diff );
699 $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
707 # Martin A. Hansen, June 2007.
709 # Calculate the global similarity of an alignment based on
710 # an alignment chain. The similarity is calculated as
711 # the number of matching residues divided by the
712 # length of the shortest sequence.
714 my ( $chain, # list of matches in alignment
715 $q_seq, # ref to query sequence
716 $s_seq, # ref to subject sequence
721 my ( $match_tot, $min, $sim );
723 return 0 if not @{ $chain };
727 map { $match_tot += $_->[ LEN ] } @{ $chain };
729 $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
731 $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
739 # Martin A. Hansen, June 2006.
741 # Given an alignment as a list of FASTA entries,
742 # generates a consensus sequences based on the
743 # entropies for each column similar to the way
744 # a sequence logo i calculated. Returns the
745 # consensus sequence as a FASTA entry.
747 my ( $entries, # list of aligned FASTA entries
748 $type, # residue type - OPTIONAL
749 $min_sim, # minimum similarity - OPTIONAL
754 my ( $bit_max, $data, $pos, $char, $score, $entry );
756 $type ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
759 if ( $type =~ /protein/ ) {
765 $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
767 foreach $pos ( @{ $data } )
769 ( $char, $score ) = @{ $pos->[ -1 ] };
771 if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
772 $entry->[ SEQ ] .= $char;
774 $entry->[ SEQ ] .= "-";
778 $entry->[ HEAD ] = "Consensus: $min_sim%";
780 return wantarray ? @{ $entry } : $entry;
784 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<