3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 use Storable qw( dclone );
37 use vars qw ( @ISA @EXPORT );
50 @ISA = qw( Exporter );
53 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
58 # Martin A. Hansen, August 2006.
60 # Generates an alignment by chaining matches, which are subsequences
61 # shared between two sequences. The routine functions by considering
62 # only matches within a given search space. If no matches are given
63 # these will be generated, if long matches are found these will be
64 # included in the alignment, otherwise matches will be included depending
65 # on a calculated score. New search spaces spanning the spaces between
66 # matches and the search space boundaries will be cast and recursed into.
68 my ( $q_seq, # sequence 1 ref
69 $s_seq, # sequence 2 ref
70 $matches, # list of matches
71 $q_min, # q sequence start position
72 $q_max, # q sequecne stop position
73 $s_min, # s sequence start position
74 $s_max, # s sequecne stop position
75 $args, # argument hash
78 # returns a chain of matches that can be chained into an alignment
83 $q_max ||= length( ${ $q_seq } ) - 1;
84 $s_max ||= length( ${ $s_seq } ) - 1;
86 $args->{ "long_matches" } ||= 50;
87 $args->{ "alph_len" } ||= 4;
89 my ( $wordsize, @chain, $match, $best_match, @long_matches );
91 $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
93 if ( scalar @{ $matches } == 0 ) # no matches - find some!
95 # $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max, $args );
97 $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
99 while ( scalar @{ $matches } == 0 and $wordsize > 1 )
102 $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
105 if ( scalar @{ $matches } > 0 ) {
106 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args );
109 elsif ( @long_matches = grep { $_->[ LEN ] >= $args->{ "long_matches" } } @{ $matches } ) # long matches found - include all that don't overlap!
111 @long_matches = order_matches( \@long_matches );
113 foreach $match ( @long_matches )
117 if ( $match->[ Q_BEG ] - $q_min >= 2 and $match->[ S_BEG ] - $s_min >= 2 ) {
118 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $match->[ Q_BEG ] - 1, $s_min, $match->[ S_BEG ] - 1, $args ); # intermediate search space
121 $q_min = $match->[ Q_END ] + 1;
122 $s_min = $match->[ S_END ] + 1;
125 if ( $q_min + 1 < $q_max and $s_min + 1 < $s_max ) {
126 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args ); # remaining search space
129 else # shorter matches are included according to score
131 foreach $match ( @{ $matches } ) {
132 # $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
133 $match->[ SCORE ] = score_match_niels( $match, $q_seq, $q_min, $q_max, $s_min, $s_max );
136 # @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
137 @{ $matches } = grep { $_->[ SCORE ] <= 0.25 } @{ $matches };
138 # @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
139 @{ $matches } = sort { $a->[ SCORE ] <=> $b->[ SCORE ] } @{ $matches };
141 $best_match = shift @{ $matches };
145 push @chain, $best_match;
147 if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
148 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1, $args ); # left search space
151 if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
152 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max, $args ); # right search space
157 return wantarray ? @chain : \@chain;
163 # Martin A. Hansen, August 2006.
165 # Given a list of matches and a search space,
166 # include only those matches contained within
169 my ( $matches, # list of matches
170 $q_min, # q sequence start position
171 $q_max, # q sequecne stop position
172 $s_min, # s sequence start position
173 $s_max, # s sequecne stop position
176 # returns list of matches
180 @matches = grep { $_->[ Q_BEG ] >= $q_min and
181 $_->[ S_BEG ] >= $s_min and
182 $_->[ Q_END ] <= $q_max and
183 $_->[ S_END ] <= $s_max } @{ $matches };
185 return wantarray ? @matches : \@matches;
191 # Martin A. Hansen, October 2006
193 # given a list of long matches, order these by length and position
194 # and include only those long matches that does not cross.
196 my ( $long_matches, # list of matches
199 # returns a list of matches
201 my ( @matches, $match, $i );
203 @{ $long_matches } = sort { $b->[ LEN ] <=> $a->[ LEN ] } @{ $long_matches };
205 @matches = shift @{ $long_matches };
207 foreach $match ( @{ $long_matches } )
209 if ( $match->[ Q_END ] < $matches[ 0 ]->[ Q_BEG ] and $match->[ S_END ] < $matches[ 0 ]->[ S_BEG ] )
211 unshift @matches, $match;
213 elsif ( $match->[ Q_BEG ] > $matches[ -1 ]->[ Q_END ] and $match->[ S_BEG ] > $matches[ -1 ]->[ S_END ] )
215 push @matches, $match;
219 for ( $i = 1; $i < @matches; $i++ )
221 if ( $matches[ $i - 1 ]->[ Q_END ] < $match->[ Q_BEG ] and $match->[ Q_END ] < $matches[ $i ]->[ Q_BEG ] and
222 $matches[ $i - 1 ]->[ S_END ] < $match->[ S_BEG ] and $match->[ S_END ] < $matches[ $i ]->[ S_BEG ]
225 splice @matches, $i, 0, dclone $match;
232 return wantarray ? @matches : \@matches;
238 # Martin A. Hansen, August 2006.
240 # Given a search space calculates the wordsize for a word so a match
241 # occurs only a few times. More matches may be needed at low similarity in
242 # order to avoid starting with a wrong match.
244 my ( $q_min, # q sequence start position
245 $q_max, # q sequecne stop position
246 $s_min, # s sequence start position
247 $s_max, # s sequecne stop position
248 $args, # argument hash
253 my ( $q_dim, $s_dim, $dim_min, $wordsize );
255 $q_dim = $q_max - $q_min + 1;
256 $s_dim = $s_max - $s_min + 1;
258 $dim_min = Maasha::Calc::min( $q_dim, $s_dim );
262 if ( $dim_min > 2000000 ) # optimized for DNA
264 $wordsize = $args->{ "long_matches" };
266 elsif ( $dim_min > 100000 ) # optimized for DNA
268 $wordsize = int( $args->{ "long_matches" } / 2 );
270 elsif ( $q_dim > 100 or $s_dim > 100 ) # optimized for DNA
272 while ( $args->{ "alph_len" } ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
278 while ( $args->{ "alph_len" } ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
289 # Martin A. Hansen, November 2006
291 # given two sequences, find all maximum expanded matches between these
293 my ( $q_seq, # sequence 1
295 $wordsize, # word size
296 $q_min, # q sequence start position
297 $q_max, # q sequecne stop position
298 $s_min, # s sequence start position
299 $s_max, # s sequecne stop position
302 # returns list of matches
304 my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches );
306 if ( length ${ $s_seq } > length ${ $q_seq } )
308 for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
310 $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
312 next if $q_word =~ /n/i; # DNA/genome optimization
314 push @{ $word_hash{ $q_word } }, $q_beg;
317 for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
319 $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
321 if ( exists $word_hash{ $s_word } )
323 foreach $q_beg ( @{ $word_hash{ $s_word } } )
325 $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
327 if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
328 $match->[ Q_END ] <= $_->[ Q_END ] and
329 $match->[ S_BEG ] >= $_->[ S_BEG ] and
330 $match->[ S_END ] <= $_->[ S_END ] } @matches )
332 next; # match is redundant
336 $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
337 $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
339 push @matches, $match;
347 for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
349 $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
351 next if $s_word =~ /n/i; # DNA/genome optimization
353 push @{ $word_hash{ $s_word } }, $s_beg;
356 for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
358 $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
360 if ( exists $word_hash{ $q_word } )
362 foreach $s_beg ( @{ $word_hash{ $q_word } } )
364 $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
366 if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
367 $match->[ Q_END ] <= $_->[ Q_END ] and
368 $match->[ S_BEG ] >= $_->[ S_BEG ] and
369 $match->[ S_END ] <= $_->[ S_END ] } @matches )
371 next; # match is redundant
375 $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
376 $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
378 push @matches, $match;
385 return wantarray ? @matches : \@matches;
391 # Martin A. Hansen, August 2006.
393 # Given two sequences and a match, expand the match maximally.
394 # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ]
396 my ( $q_seq, # sequence 1 ref
397 $s_seq, # sequence 2 ref
398 $match, # sequence match
399 $q_max, # q sequecne stop position
400 $q_min, # q sequence start position
401 $s_max, # s sequecne stop position
402 $s_min, # s sequence start position
407 my ( $q_pos, $s_pos );
411 $q_pos = $match->[ Q_END ] + 1;
412 $s_pos = $match->[ S_END ] + 1;
414 while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
420 $match->[ Q_END ] = $q_pos - 1;
421 $match->[ S_END ] = $s_pos - 1;
423 # expanding backwards
425 $q_pos = $match->[ Q_BEG ] - 1;
426 $s_pos = $match->[ S_BEG ] - 1;
428 while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
434 $match->[ Q_BEG ] = $q_pos + 1;
435 $match->[ S_BEG ] = $s_pos + 1;
443 # Martin A. Hansen, August 2006
445 # given a match and a search space scores the match according to three criteria:
447 # 1) length of match - the longer the better.
448 # 2) distance to closest corner - the shorter the better.
449 # 3) distance to closest narrow end of the search space - the shorter the better.
451 # each of these scores are divided by search space dimentions, and the total score
452 # is calculated: total = score_len - score_corner - score_narrow
454 # the higher the score, the better the match.
457 $q_min, # q sequence start position
458 $q_max, # q sequecne stop position
459 $s_min, # s sequence start position
460 $s_max, # s sequecne stop position
463 # returns a positive number
465 my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist,
466 $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist );
468 # ----- 1) scoring according to match length
470 $score_len = $match->[ LEN ] ** 3;
472 # ---- 2) score according to distance away from diagonal
474 $q_dim = $q_max - $q_min + 1;
475 $s_dim = $s_max - $s_min + 1;
477 if ( $q_dim >= $s_dim ) # s_dim is the narrow end
479 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim );
480 $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max );
484 $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim );
485 $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max );
488 $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
490 $score_diag = 2 * $min_diag_dist ** 2;
492 # ----- 3) scoring according to distance to the narrow end of the search space
494 if ( $q_dim > $s_dim ) # s_dim is the narrow end
496 $beg_narrow_dist = $match->[ Q_BEG ] - $q_min;
497 $end_narrow_dist = $q_max - $match->[ Q_BEG ];
499 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
501 elsif ( $q_dim < $s_dim )
503 $beg_narrow_dist = $match->[ S_BEG ] - $s_min;
504 $end_narrow_dist = $s_max - $match->[ S_BEG ];
506 $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
510 $max_narrow_dist = 0;
513 $score_narrow = $max_narrow_dist;
515 $score_total = $score_len - $score_narrow - $score_diag;
516 # $score_total = -1 if 3 * $min_diag_dist > $match->[ LEN ];
522 sub score_match_niels
524 # Niels Larsen, June 2004.
526 # Creates a crude "heuristic" attempt of telling how likely it is that a
527 # given match occurs by chance in a given search space. If sequences are
528 # given their composition is taken into account. The scoring punishes
529 # distance from diagonal(s) and distance from previous match(es). Scores
530 # range from zero and up, and lower is better.
532 my ( $match, # Match array
533 $q_seq, # Either q_seq or s_seq
534 $q_min, # Lower bound search area (query sequence)
535 $q_max, # Upper bound search area (query sequence)
536 $s_min, # Lower bound search area (subject sequence)
537 $s_max, # Upper bound search area (subject sequence)
540 # Returns a positive number.
542 my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos,
543 $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i,
544 $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch,
545 $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta,
548 $q_beg = $match->[Q_BEG];
549 $q_end = $match->[Q_END];
550 $s_beg = $match->[S_BEG];
551 $s_end = $match->[S_END];
553 # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<<
555 # Get the probability of a match from the sequence composition (when
556 # match is longer than 20 and sequence is given, otherwise use 0.25)
557 # and raise that to the power of the length.
559 if ( $match->[LEN] > 20 and defined $q_seq )
561 $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1;
562 $seqlen = length $seq;
564 $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0;
566 for ( $i = 0; $i < $seqlen; $i++ )
568 $chars{ substr $seq, $i, 1 }++;
571 $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2
572 + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2;
578 $score = $pmatch ** ( $q_end - $q_beg + 1 );
580 # # Punish by difference in height and width of search space,
582 # $q_dim = $q_max - $q_min + 1;
583 # $s_dim = $s_max - $s_min + 1;
585 # if ( $q_dim != $s_dim ) {
586 # $score *= abs ( $q_dim - $s_dim ) ** 2;
589 return $score if $score > 0.25;
591 # Punish by how far the match is to the closest corner of the search
594 $q_delta_beg = $q_beg - $q_min;
595 $s_delta_beg = $s_beg - $s_min;
597 $q_delta_end = $q_max - $q_end;
598 $s_delta_end = $s_max - $s_end;
600 if ( $q_delta_beg > $s_delta_beg ) {
601 $delta_beg_max = $q_delta_beg;
603 $delta_beg_max = $s_delta_beg;
606 if ( $q_delta_end > $s_delta_end ) {
607 $delta_end_max = $q_delta_end;
609 $delta_end_max = $s_delta_end;
612 if ( $delta_beg_max <= $delta_end_max ) {
613 $score *= ($delta_beg_max+1) ** 2.0;
615 $score *= ($delta_end_max+1) ** 2.0;
618 return $score if $score > 0.25;
620 # Add penalty if the match is towards the narrow end of the
623 if ( ($q_max - $q_min) <= ($s_max - $s_min) )
625 if ( $q_delta_beg > $s_delta_beg )
627 $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3;
629 elsif ( $q_delta_end > $s_delta_end )
631 $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3;
636 if ( $s_delta_beg > $q_delta_beg )
638 $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3;
640 elsif ( $s_delta_end > $q_delta_end )
642 $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3;
647 print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n";
648 die qq (Score <= 0 -> $score);
657 # Martin A. Hansen, August 2006.
659 # Routine to print an alignment in fasta format based
660 # on a list of matches and two given sequences.
662 my ( $matches, # list of matches
663 $q_head, # query sequence head
664 $q_seq, # query sequence ref
665 $s_head, # subject sequence head
666 $s_seq, # subject sequence ref
669 my ( $q_pos, $s_pos, $q_nomatch, $q_match, $s_nomatch, $match, $q_aseq, $s_aseq, $i );
671 @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
676 for ( $i = 0; $i < @{ $matches }; $i++ )
678 $match = $matches->[ $i ];
680 $q_nomatch = $match->[ Q_BEG ] - $q_pos;
681 $s_nomatch = $match->[ S_BEG ] - $s_pos;
683 if ( $q_nomatch - $s_nomatch > 0 )
685 $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
686 $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
687 $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
689 elsif ( $s_nomatch - $q_nomatch > 0 )
691 $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
692 $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
693 $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
697 $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
698 $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
701 $q_pos += $q_nomatch + $match->[ LEN ];
702 $s_pos += $s_nomatch + $match->[ LEN ];
705 $match = $matches->[ -1 ] || [ 0, 0, 0, 0, 0 ];
707 $q_nomatch = length( ${ $q_seq } ) - $match->[ Q_END ];
708 $s_nomatch = length( ${ $s_seq } ) - $match->[ S_END ];
710 if ( $q_nomatch - $s_nomatch > 0 )
712 $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
713 $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
714 $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
716 elsif ( $s_nomatch - $q_nomatch > 0 )
718 $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
719 $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
720 $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
724 $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
725 $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
728 print ">$q_head\n$q_aseq\n>$s_head\n$s_aseq\n";
734 # Martin A. Hansen, February 2007.
736 my ( $matches, # list of matches
737 $q_head, # query sequence head
738 $q_seq, # query sequence ref
739 $s_head, # subject sequence head
740 $s_seq, # subject sequence ref
741 $args, # argument hash - OPTIONAL
744 $args->{ "wrap" } ||= 80;
746 my ( $q_pos, $s_pos, $match, $q_nomatch, $q_match, $s_nomatch, $q_aseq, $s_aseq, $pins, $i, $q, $s, $q_ruler, $s_ruler, $entries );
748 @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
753 for ( $i = 0; $i < @{ $matches }; $i++ )
755 $match = $matches->[ $i ];
757 $q_nomatch = $match->[ Q_BEG ] - $q_pos;
758 $s_nomatch = $match->[ S_BEG ] - $s_pos;
763 if ( $q_nomatch - $s_nomatch > 0 )
765 $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
766 $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
767 $pins .= " " x ( $q_nomatch - $s_nomatch );
768 $q += ( $q_nomatch - $s_nomatch );
770 elsif ( $s_nomatch - $q_nomatch > 0 )
772 $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
773 $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
774 $pins .= " " x ( $s_nomatch - $q_nomatch );
775 $s += ( $s_nomatch - $q_nomatch );
778 while ( $q < $q_pos + $q_nomatch and $s < $s_pos + $s_nomatch )
780 $q_aseq .= substr ${ $q_seq }, $q, 1;
781 $s_aseq .= substr ${ $s_seq }, $s, 1;
783 if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) )
794 $q_aseq .= substr ${ $q_seq }, $match->[ Q_BEG ], $match->[ LEN ];
795 $s_aseq .= substr ${ $s_seq }, $match->[ S_BEG ], $match->[ LEN ];
796 $pins .= "|" x $match->[ LEN ];
798 $q_pos += $q_nomatch + $match->[ LEN ];
799 $s_pos += $s_nomatch + $match->[ LEN ];
802 $q_nomatch = length( ${ $q_seq } ) - ( $match->[ Q_END ] || 0 );
803 $s_nomatch = length( ${ $s_seq } ) - ( $match->[ S_END ] || 0 );
808 while ( $q < $q_pos + $q_nomatch and $q < length ${ $q_seq } and $s < $s_pos + $s_nomatch and $s < length ${ $s_seq } )
810 $q_aseq .= substr ${ $q_seq }, $q, 1;
811 $s_aseq .= substr ${ $s_seq }, $s, 1;
813 if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) ) {
825 if ( $q_nomatch - $s_nomatch > 0 )
827 $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
828 $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
829 $pins .= " " x ( $q_nomatch - $s_nomatch );
831 elsif ( $s_nomatch - $q_nomatch > 0 )
833 $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
834 $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
835 $pins .= " " x ( $s_nomatch - $q_nomatch );
838 $q_ruler = make_ruler( $q_aseq );
839 $s_ruler = make_ruler( $s_aseq );
842 [ "ruler", $q_ruler ],
843 [ $q_head, $q_aseq ],
844 [ "consensus", $pins ],
845 [ $s_head, $s_aseq ],
846 [ "ruler", $s_ruler ],
849 align_print_multi( $entries, undef, $args->{ "wrap" } )
855 # Martin A. Hansen, February 2007;
857 # Gererates a ruler for a given sequence (with indels).
864 my ( $i, $char, $skip, $count, $gap, $tics );
869 while ( $i <= length $seq )
871 $char = substr $seq, $i - 1, 1;
873 $gap++ if $char eq "-";
882 $count = 1 if $char eq "-";
884 if ( $count % 100 == 0 )
886 if ( $count + length( $count ) >= length $seq )
892 $tics .= "|" . $count;
893 $skip = length $count;
896 elsif ( $count % 50 == 0 ) {
898 } elsif ( $count % 10 == 0 ) {
914 # Martin A. Hansen, May 2007.
916 # Calculate the local similarity of an alignment based on
917 # an alignment chain. The similarity is calculated as
918 # the number of matching residues divided by the overall
919 # length of the alignment chain. This means that a short
920 # but "good" alignment will yield a high similarity, while
921 # a long "poor" alignment will yeild a low similarity.
923 my ( $chain, # list of matches in alignment
928 my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
930 return 0 if not @{ $chain };
938 foreach $match ( @{ $chain } )
940 $match_tot += $match->[ LEN ];
942 $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
943 $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
945 $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
946 $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
949 $q_diff = $q_end - $q_beg + 1;
950 $s_diff = $s_end - $s_beg + 1;
952 $max = Maasha::Calc::max( $q_diff, $s_diff );
954 $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
962 # Martin A. Hansen, June 2007.
964 # Calculate the global similarity of an alignment based on
965 # an alignment chain. The similarity is calculated as
966 # the number of matching residues divided by the
967 # length of the shortest sequence.
969 my ( $chain, # list of matches in alignment
970 $q_seq, # ref to query sequence
971 $s_seq, # ref to subject sequence
976 my ( $match_tot, $min, $sim );
978 return 0 if not @{ $chain };
982 map { $match_tot += $_->[ LEN ] } @{ $chain };
984 $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
986 $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
994 # Martin A. Hansen, June 2006.
996 # Given an alignment as a list of FASTA entries,
997 # generates a consensus sequences based on the
998 # entropies for each column similar to the way
999 # a sequence logo i calculated. Returns the
1000 # consensus sequence as a FASTA entry.
1002 my ( $entries, # list of aligned FASTA entries
1003 $type, # residue type - OPTIONAL
1004 $min_sim, # minimum similarity - OPTIONAL
1009 my ( $bit_max, $data, $pos, $char, $score, $entry );
1011 $type ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
1014 if ( $type =~ /protein/ ) {
1020 $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
1022 foreach $pos ( @{ $data } )
1024 ( $char, $score ) = @{ $pos->[ -1 ] };
1026 if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
1027 $entry->[ SEQ ] .= $char;
1029 $entry->[ SEQ ] .= "-";
1033 $entry->[ HEAD ] = "Consensus: $min_sim%";
1035 return wantarray ? @{ $entry } : $entry;
1039 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<