1 package Maasha::AlignTwoSeq;
3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
34 use Storable qw( dclone );
37 use vars qw ( @ISA @EXPORT );
51 @ISA = qw( Exporter );
54 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
59 # Martin A. Hansen, August 2006.
61 # Generates an alignment by chaining matches, which are subsequences
62 # shared between two sequences. The routine functions by considering
63 # only matches within a given search space. If no matches are given
64 # these will be generated, if long matches are found these will be
65 # included in the alignment, otherwise matches will be included depending
66 # on a calculated score. New search spaces spanning the spaces between
67 # matches and the search space boundaries will be cast and recursed into.
69 my ( $q_seq, # sequence 1 ref
70 $s_seq, # sequence 2 ref
71 $matches, # list of matches
72 $q_min, # q sequence start position
73 $q_max, # q sequecne stop position
74 $s_min, # s sequence start position
75 $s_max, # s sequecne stop position
78 # returns a chain of matches that can be chained into an alignment
83 $q_max ||= length( ${ $q_seq } ) - 1;
84 $s_max ||= length( ${ $s_seq } ) - 1;
86 my ( $wordsize, @chain, $match, $best_match, @long_matches );
88 $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
90 if ( scalar @{ $matches } == 0 ) # no matches - find some!
92 $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max );
93 $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
95 while ( scalar @{ $matches } == 0 and $wordsize > 1 )
98 $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
101 if ( scalar @{ $matches } > 0 ) {
102 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max );
105 else # matches are included according to score
107 foreach $match ( @{ $matches } ) {
108 $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
111 @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
112 @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
114 $best_match = shift @{ $matches };
118 push @chain, $best_match;
120 if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
121 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1 ); # left search space
124 if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
125 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max ); # right search space
130 return wantarray ? @chain : \@chain;
136 # Martin A. Hansen, August 2006.
138 # Given a list of matches and a search space,
139 # include only those matches contained within
142 my ( $matches, # list of matches
143 $q_min, # q sequence start position
144 $q_max, # q sequecne stop position
145 $s_min, # s sequence start position
146 $s_max, # s sequecne stop position
149 # returns list of matches
153 @matches = grep { $_->[ Q_BEG ] >= $q_min and
154 $_->[ S_BEG ] >= $s_min and
155 $_->[ Q_END ] <= $q_max and
156 $_->[ S_END ] <= $s_max } @{ $matches };
158 return wantarray ? @matches : \@matches;
164 # Martin A. Hansen, August 2006.
166 # Given a search space calculates the wordsize for a word so a match
167 # occurs only a few times. More matches may be needed at low similarity in
168 # order to avoid starting with a wrong match.
170 my ( $q_min, # q sequence start position
171 $q_max, # q sequecne stop position
172 $s_min, # s sequence start position
173 $s_max, # s sequecne stop position
178 my ( $q_dim, $s_dim, $dim_min, $wordsize );
180 $q_dim = $q_max - $q_min + 1;
181 $s_dim = $s_max - $s_min + 1;
183 $dim_min = Maasha::Calc::min( $q_dim, $s_dim );
187 if ( $dim_min > 2000000 ) # optimized for DNA
191 elsif ( $dim_min > 100000 ) # optimized for DNA
195 elsif ( $q_dim > 100 or $s_dim > 100 ) # optimized for DNA
197 while ( ALPH_LEN ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
203 while ( ALPH_LEN ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
214 # Martin A. Hansen, November 2006
216 # given two sequences, find all maximum expanded matches between these
218 my ( $q_seq, # sequence 1
220 $wordsize, # word size
221 $q_min, # q sequence start position
222 $q_max, # q sequecne stop position
223 $s_min, # s sequence start position
224 $s_max, # s sequecne stop position
227 # returns list of matches
229 my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches );
231 if ( length ${ $s_seq } > length ${ $q_seq } )
233 for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
235 $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
237 next if $q_word =~ /n/i; # DNA/genome optimization
239 push @{ $word_hash{ $q_word } }, $q_beg;
242 for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
244 $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
246 if ( exists $word_hash{ $s_word } )
248 foreach $q_beg ( @{ $word_hash{ $s_word } } )
250 $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
252 if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
253 $match->[ Q_END ] <= $_->[ Q_END ] and
254 $match->[ S_BEG ] >= $_->[ S_BEG ] and
255 $match->[ S_END ] <= $_->[ S_END ] } @matches )
257 next; # match is redundant
261 $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
262 $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
264 push @matches, $match;
272 for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
274 $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
276 next if $s_word =~ /n/i; # DNA/genome optimization
278 push @{ $word_hash{ $s_word } }, $s_beg;
281 for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
283 $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
285 if ( exists $word_hash{ $q_word } )
287 foreach $s_beg ( @{ $word_hash{ $q_word } } )
289 $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
291 if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
292 $match->[ Q_END ] <= $_->[ Q_END ] and
293 $match->[ S_BEG ] >= $_->[ S_BEG ] and
294 $match->[ S_END ] <= $_->[ S_END ] } @matches )
296 next; # match is redundant
300 $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
301 $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
303 push @matches, $match;
310 return wantarray ? @matches : \@matches;
316 # Martin A. Hansen, August 2006.
318 # Given two sequences and a match, expand the match maximally.
319 # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ]
321 my ( $q_seq, # sequence 1 ref
322 $s_seq, # sequence 2 ref
323 $match, # sequence match
324 $q_max, # q sequecne stop position
325 $q_min, # q sequence start position
326 $s_max, # s sequecne stop position
327 $s_min, # s sequence start position
332 my ( $q_pos, $s_pos );
336 $q_pos = $match->[ Q_END ] + 1;
337 $s_pos = $match->[ S_END ] + 1;
339 while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
345 $match->[ Q_END ] = $q_pos - 1;
346 $match->[ S_END ] = $s_pos - 1;
348 # expanding backwards
350 $q_pos = $match->[ Q_BEG ] - 1;
351 $s_pos = $match->[ S_BEG ] - 1;
353 while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
359 $match->[ Q_BEG ] = $q_pos + 1;
360 $match->[ S_BEG ] = $s_pos + 1;
368 # Martin A. Hansen, August 2006
370 # given a match and a search space scores the match according to three criteria:
372 # 1) length of match - the longer the better.
373 # 2) distance to closest corner - the shorter the better.
374 # 3) distance to closest narrow end of the search space - the shorter the better.
376 # each of these scores are divided by search space dimentions, and the total score
377 # is calculated: total = score_len - score_corner - score_narrow
379 # the higher the score, the better the match.
382 $q_min, # q sequence start position
383 $q_max, # q sequecne stop position
384 $s_min, # s sequence start position
385 $s_max, # s sequecne stop position
388 # returns a positive number
390 my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist,
391 $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist );
393 # ----- 1) scoring according to match length
395 $score_len = $match->[ LEN ] ** 3;
397 # ---- 2) score according to distance away from diagonal
399 $q_dim = $q_max - $q_min + 1;
400 $s_dim = $s_max - $s_min + 1;
402 if ( $q_dim >= $s_dim ) # s_dim is the narrow end
404 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim );
405 $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max );
409 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim );
410 $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max );
413 $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
415 $score_diag = 2 * $min_diag_dist ** 2;
417 # ----- 3) scoring according to distance to the narrow end of the search space
419 if ( $q_dim > $s_dim ) # s_dim is the narrow end
421 $beg_narrow_dist = $match->[ Q_BEG ] - $q_min;
422 $end_narrow_dist = $q_max - $match->[ Q_BEG ];
424 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
426 elsif ( $q_dim < $s_dim )
428 $beg_narrow_dist = $match->[ S_BEG ] - $s_min;
429 $end_narrow_dist = $s_max - $match->[ S_BEG ];
431 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
435 $max_narrow_dist = 0;
438 $score_narrow = $max_narrow_dist;
440 $score_total = $score_len - $score_narrow - $score_diag;
446 sub score_match_niels
448 # Niels Larsen, June 2004.
450 # Creates a crude "heuristic" attempt of telling how likely it is that a
451 # given match occurs by chance in a given search space. If sequences are
452 # given their composition is taken into account. The scoring punishes
453 # distance from diagonal(s) and distance from previous match(es). Scores
454 # range from zero and up, and lower is better.
456 my ( $match, # Match array
457 $q_seq, # Either q_seq or s_seq
458 $q_min, # Lower bound search area (query sequence)
459 $q_max, # Upper bound search area (query sequence)
460 $s_min, # Lower bound search area (subject sequence)
461 $s_max, # Upper bound search area (subject sequence)
464 # Returns a positive number.
466 my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos,
467 $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i,
468 $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch,
469 $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta,
472 $q_beg = $match->[Q_BEG];
473 $q_end = $match->[Q_END];
474 $s_beg = $match->[S_BEG];
475 $s_end = $match->[S_END];
477 # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<<
479 # Get the probability of a match from the sequence composition (when
480 # match is longer than 20 and sequence is given, otherwise use 0.25)
481 # and raise that to the power of the length.
483 if ( $match->[LEN] > 20 and defined $q_seq )
485 $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1;
486 $seqlen = length $seq;
488 $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0;
490 for ( $i = 0; $i < $seqlen; $i++ )
492 $chars{ substr $seq, $i, 1 }++;
495 $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2
496 + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2;
502 $score = $pmatch ** ( $q_end - $q_beg + 1 );
504 # # Punish by difference in height and width of search space,
506 # $q_dim = $q_max - $q_min + 1;
507 # $s_dim = $s_max - $s_min + 1;
509 # if ( $q_dim != $s_dim ) {
510 # $score *= abs ( $q_dim - $s_dim ) ** 2;
513 return $score if $score > 0.25;
515 # Punish by how far the match is to the closest corner of the search
518 $q_delta_beg = $q_beg - $q_min;
519 $s_delta_beg = $s_beg - $s_min;
521 $q_delta_end = $q_max - $q_end;
522 $s_delta_end = $s_max - $s_end;
524 if ( $q_delta_beg > $s_delta_beg ) {
525 $delta_beg_max = $q_delta_beg;
527 $delta_beg_max = $s_delta_beg;
530 if ( $q_delta_end > $s_delta_end ) {
531 $delta_end_max = $q_delta_end;
533 $delta_end_max = $s_delta_end;
536 if ( $delta_beg_max <= $delta_end_max ) {
537 $score *= ($delta_beg_max+1) ** 2.0;
539 $score *= ($delta_end_max+1) ** 2.0;
542 return $score if $score > 0.25;
544 # Add penalty if the match is towards the narrow end of the
547 if ( ($q_max - $q_min) <= ($s_max - $s_min) )
549 if ( $q_delta_beg > $s_delta_beg )
551 $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3;
553 elsif ( $q_delta_end > $s_delta_end )
555 $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3;
560 if ( $s_delta_beg > $q_delta_beg )
562 $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3;
564 elsif ( $s_delta_end > $q_delta_end )
566 $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3;
571 print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n";
572 die qq (Score <= 0 -> $score);
581 # Martin A. Hansen, June 2009.
583 # Given a list of matches and references to q_seq and s_seq,
584 # insert indels to form aligned sequences.
586 my ( $matches, # list of matches
587 $q_seq, # ref to query sequence
588 $s_seq, # ref to subject sequence
593 my ( $q_indels, $s_indels, $match, $diff, $first );
595 @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
601 # ---- initial indels ----
603 $match = shift @{ $matches };
605 substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
606 substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
608 $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
612 substr ${ $q_seq }, 0, 0, '-' x abs $diff;
614 $q_indels += abs $diff;
618 substr ${ $s_seq }, 0, 0, '-' x abs $diff;
620 $s_indels += abs $diff;
623 # ---- internal indels ----
625 foreach $match ( @{ $matches } )
627 substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
628 substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
630 $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
634 substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff;
636 $q_indels += abs $diff;
640 substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff;
642 $s_indels += abs $diff;
646 # ---- last indels ----
648 $diff = length( ${ $s_seq } ) - length( ${ $q_seq } );
651 ${ $q_seq } .= '-' x abs $diff;
653 ${ $s_seq } .= '-' x abs $diff;
660 # Martin A. Hansen, May 2007.
662 # Calculate the local similarity of an alignment based on
663 # an alignment chain. The similarity is calculated as
664 # the number of matching residues divided by the overall
665 # length of the alignment chain. This means that a short
666 # but "good" alignment will yield a high similarity, while
667 # a long "poor" alignment will yeild a low similarity.
669 my ( $chain, # list of matches in alignment
674 my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
676 return 0 if not @{ $chain };
684 foreach $match ( @{ $chain } )
686 $match_tot += $match->[ LEN ];
688 $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
689 $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
691 $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
692 $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
695 $q_diff = $q_end - $q_beg + 1;
696 $s_diff = $s_end - $s_beg + 1;
698 $max = Maasha::Calc::max( $q_diff, $s_diff );
700 $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
708 # Martin A. Hansen, June 2007.
710 # Calculate the global similarity of an alignment based on
711 # an alignment chain. The similarity is calculated as
712 # the number of matching residues divided by the
713 # length of the shortest sequence.
715 my ( $chain, # list of matches in alignment
716 $q_seq, # ref to query sequence
717 $s_seq, # ref to subject sequence
722 my ( $match_tot, $min, $sim );
724 return 0 if not @{ $chain };
728 map { $match_tot += $_->[ LEN ] } @{ $chain };
730 $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
732 $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
740 # Martin A. Hansen, June 2006.
742 # Given an alignment as a list of FASTA entries,
743 # generates a consensus sequences based on the
744 # entropies for each column similar to the way
745 # a sequence logo i calculated. Returns the
746 # consensus sequence as a FASTA entry.
748 my ( $entries, # list of aligned FASTA entries
749 $type, # residue type - OPTIONAL
750 $min_sim, # minimum similarity - OPTIONAL
755 my ( $bit_max, $data, $pos, $char, $score, $entry );
757 $type ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
760 if ( $type =~ /protein/ ) {
766 $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
768 foreach $pos ( @{ $data } )
770 ( $char, $score ) = @{ $pos->[ -1 ] };
772 if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
773 $entry->[ SEQ ] .= $char;
775 $entry->[ SEQ ] .= "-";
779 $entry->[ HEAD ] = "Consensus: $min_sim%";
781 return wantarray ? @{ $entry } : $entry;
785 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<