code_perl/Maasha/AlignTwoSeq.pm

   1 package Maasha::AlignTwoSeq;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # yak yak yak
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use warnings;
  32 use strict;
  33 use Data::Dumper;
  34 use Storable qw( dclone );
  35 use Maasha::Calc;
  36 use Maasha::Seq;
  37 use vars qw ( @ISA @EXPORT );
  38
  39 use constant {
  40     Q_BEG    => 0,
  41     Q_END    => 1,
  42     S_BEG    => 2,
  43     S_END    => 3,
  44     LEN      => 4,
  45     SCORE    => 5,
  46     HEAD     => 0,
  47     SEQ      => 1,
  48     ALPH_LEN => 4,
  49 };
  50
  51 @ISA = qw( Exporter );
  52
  53
  54 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  55
  56
  57 sub align_two_seq
  58 {
  59     # Martin A. Hansen, August 2006.
  60
  61     # Generates an alignment by chaining matches, which are subsequences
  62     # shared between two sequences. The routine functions by considering
  63     # only matches within a given search space. If no matches are given
  64     # these will be generated, if long matches are found these will be
  65     # included in the alignment, otherwise matches will be included depending
  66     # on a calculated score. New search spaces spanning the spaces between
  67     # matches and the search space boundaries will be cast and recursed into.
  68
  69     my ( $q_seq,     # sequence 1 ref
  70          $s_seq,     # sequence 2 ref
  71          $matches,   # list of matches
  72          $q_min,     # q sequence start position
  73          $q_max,     # q sequecne stop  position
  74          $s_min,     # s sequence start position
  75          $s_max,     # s sequecne stop  position
  76        ) = @_;
  77
  78     # returns a chain of matches that can be chained into an alignment
  79
  80     $matches ||= [];
  81     $q_min   ||= 0;
  82     $s_min   ||= 0;
  83     $q_max   ||= length( ${ $q_seq } ) - 1;
  84     $s_max   ||= length( ${ $s_seq } ) - 1;
  85
  86     my ( $wordsize, @chain, $match, $best_match, @long_matches );
  87
  88     $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
  89
  90     if ( scalar @{ $matches } == 0 )   # no matches - find some!
  91     {
  92         $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max );
  93         $matches  = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
  94
  95         while ( scalar @{ $matches } == 0 and $wordsize > 1 )
  96         {
  97             $wordsize--;
  98             $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
  99         }
 100
 101         if ( scalar @{ $matches } > 0 ) {
 102             push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max );
 103         }
 104     }
 105     else   # matches are included according to score
 106     {
 107         foreach $match ( @{ $matches } ) {
 108             $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
 109         }
 110
 111         @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
 112         @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
 113
 114         $best_match   = shift @{ $matches };
 115
 116         if ( $best_match )
 117         {
 118             push @chain, $best_match;
 119
 120             if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
 121                 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1 );   # left search space
 122             }
 123
 124             if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
 125                 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max );   # right search space
 126             }
 127         }
 128     }
 129
 130     return wantarray ? @chain : \@chain;
 131 }
 132
 133
 134 sub select_matches
 135 {
 136     # Martin A. Hansen, August 2006.
 137
 138     # Given a list of matches and a search space,
 139     # include only those matches contained within
 140     # this search space.
 141
 142     my ( $matches,   # list of matches
 143          $q_min,     # q sequence start position
 144          $q_max,     # q sequecne stop  position
 145          $s_min,     # s sequence start position
 146          $s_max,     # s sequecne stop  position
 147        ) = @_;
 148
 149     # returns list of matches
 150
 151     my ( @matches );
 152
 153     @matches = grep { $_->[ Q_BEG ] >= $q_min and
 154                       $_->[ S_BEG ] >= $s_min and
 155                       $_->[ Q_END ] <= $q_max and
 156                       $_->[ S_END ] <= $s_max } @{ $matches };
 157
 158     return wantarray ? @matches : \@matches;
 159 }
 160
 161
 162 sub find_wordsize
 163 {
 164     # Martin A. Hansen, August 2006.
 165
 166     # Given a search space calculates the wordsize for a word so a match
 167     # occurs only a few times. More matches may be needed at low similarity in
 168     # order to avoid starting with a wrong match.
 169
 170     my ( $q_min,   # q sequence start position
 171          $q_max,   # q sequecne stop position
 172          $s_min,   # s sequence start position
 173          $s_max,   # s sequecne stop position
 174        ) = @_;
 175
 176     # returns integer
 177
 178     my ( $q_dim, $s_dim, $dim_min, $wordsize );
 179
 180     $q_dim = $q_max - $q_min + 1;
 181     $s_dim = $s_max - $s_min + 1;
 182
 183     $dim_min = Maasha::Calc::min( $q_dim, $s_dim );
 184
 185     $wordsize = 1;
 186
 187     if ( $dim_min > 2000000 )               # optimized for DNA
 188     {
 189         $wordsize = 50;
 190     }
 191     elsif ( $dim_min > 100000 )             # optimized for DNA
 192     {
 193         $wordsize = 25;
 194     }
 195     elsif ( $q_dim > 100 or $s_dim > 100 )  # optimized for DNA
 196     {
 197         while ( ALPH_LEN ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
 198             $wordsize++;
 199         }
 200     }
 201     else
 202     {
 203         while ( ALPH_LEN ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
 204             $wordsize++;
 205         }
 206     }
 207
 208     return $wordsize;
 209 }
 210
 211
 212 sub find_matches
 213 {
 214     # Martin A. Hansen, November 2006
 215
 216     # given two sequences, find all maximum expanded matches between these
 217
 218     my ( $q_seq,      # sequence 1
 219          $s_seq,      # sequence 2
 220          $wordsize,   # word size
 221          $q_min,      # q sequence start position
 222          $q_max,      # q sequecne stop  position
 223          $s_min,      # s sequence start position
 224          $s_max,      # s sequecne stop  position
 225        ) = @_;
 226
 227     # returns list of matches
 228
 229     my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches );
 230
 231     if ( length ${ $s_seq } > length ${ $q_seq } )
 232     {
 233         for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
 234         {
 235             $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
 236
 237             next if $q_word =~ /n/i; #   DNA/genome optimization
 238
 239             push @{ $word_hash{ $q_word } }, $q_beg;
 240         }
 241
 242         for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
 243         {
 244             $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
 245
 246             if ( exists $word_hash{ $s_word } )
 247             {
 248                 foreach $q_beg ( @{ $word_hash{ $s_word } } )
 249                 {
 250                     $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
 251
 252                     if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
 253                                 $match->[ Q_END ] <= $_->[ Q_END ] and
 254                                 $match->[ S_BEG ] >= $_->[ S_BEG ] and
 255                                 $match->[ S_END ] <= $_->[ S_END ] } @matches )
 256                     {
 257                         next;   # match is redundant
 258                     }
 259                     else
 260                     {
 261                         $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
 262                         $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
 263
 264                         push @matches, $match;
 265                     }
 266                 }
 267             }
 268         }
 269     }
 270     else
 271     {
 272         for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
 273         {
 274             $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
 275
 276             next if $s_word =~ /n/i; #   DNA/genome optimization
 277
 278             push @{ $word_hash{ $s_word } }, $s_beg;
 279         }
 280
 281         for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
 282         {
 283             $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
 284
 285             if ( exists $word_hash{ $q_word } )
 286             {
 287                 foreach $s_beg ( @{ $word_hash{ $q_word } } )
 288                 {
 289                     $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
 290
 291                     if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
 292                                 $match->[ Q_END ] <= $_->[ Q_END ] and
 293                                 $match->[ S_BEG ] >= $_->[ S_BEG ] and
 294                                 $match->[ S_END ] <= $_->[ S_END ] } @matches )
 295                     {
 296                         next;   # match is redundant
 297                     }
 298                     else
 299                     {
 300                         $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
 301                         $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
 302
 303                         push @matches, $match;
 304                     }
 305                 }
 306             }
 307         }
 308     }
 309
 310     return wantarray ? @matches : \@matches;
 311 }
 312
 313
 314 sub expand_match
 315 {
 316     # Martin A. Hansen, August 2006.
 317
 318     # Given two sequences and a match, expand the match maximally.
 319     # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ]
 320
 321     my ( $q_seq,   # sequence 1 ref
 322          $s_seq,   # sequence 2 ref
 323          $match,   # sequence match
 324          $q_max,   # q sequecne stop position
 325          $q_min,   # q sequence start position
 326          $s_max,   # s sequecne stop position
 327          $s_min,   # s sequence start position
 328        ) = @_;
 329
 330     # returns match
 331
 332     my ( $q_pos, $s_pos );
 333
 334     # expanding forward
 335
 336     $q_pos = $match->[ Q_END ] + 1;
 337     $s_pos = $match->[ S_END ] + 1;
 338
 339     while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
 340     {
 341         $q_pos++;
 342         $s_pos++;
 343     }
 344
 345     $match->[ Q_END ] = $q_pos - 1;
 346     $match->[ S_END ] = $s_pos - 1;
 347
 348     # expanding backwards
 349
 350     $q_pos = $match->[ Q_BEG ] - 1;
 351     $s_pos = $match->[ S_BEG ] - 1;
 352
 353     while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
 354     {
 355         $q_pos--;
 356         $s_pos--;
 357     }
 358
 359     $match->[ Q_BEG ] = $q_pos + 1;
 360     $match->[ S_BEG ] = $s_pos + 1;
 361
 362     return $match;
 363 }
 364
 365
 366 sub score_match
 367 {
 368     # Martin A. Hansen, August 2006
 369
 370     # given a match and a search space scores the match according to three criteria:
 371
 372     # 1) length of match - the longer the better.
 373     # 2) distance to closest corner - the shorter the better.
 374     # 3) distance to closest narrow end of the search space - the shorter the better.
 375
 376     # each of these scores are divided by search space dimentions, and the total score
 377     # is calculated: total = score_len - score_corner - score_narrow
 378
 379     # the higher the score, the better the match.
 380
 381     my ( $match,   # match
 382          $q_min,   # q sequence start position
 383          $q_max,   # q sequecne stop  position
 384          $s_min,   # s sequence start position
 385          $s_max,   # s sequecne stop  position
 386        ) = @_;
 387
 388     # returns a positive number
 389
 390     my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist,
 391          $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist );
 392
 393     # ----- 1) scoring according to match length
 394
 395     $score_len = $match->[ LEN ] ** 3;
 396
 397     # ---- 2) score according to distance away from diagonal
 398
 399     $q_dim = $q_max - $q_min + 1;
 400     $s_dim = $s_max - $s_min + 1;
 401
 402     if ( $q_dim >= $s_dim ) # s_dim is the narrow end
 403     {
 404         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim );
 405         $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max );
 406     }
 407     else
 408     {
 409         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim );
 410         $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max );
 411     }
 412
 413     $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
 414
 415     $score_diag = 2 * $min_diag_dist ** 2;
 416
 417     # ----- 3) scoring according to distance to the narrow end of the search space
 418
 419     if ( $q_dim > $s_dim ) # s_dim is the narrow end
 420     {
 421         $beg_narrow_dist = $match->[ Q_BEG ] - $q_min;
 422         $end_narrow_dist = $q_max - $match->[ Q_BEG ];
 423
 424         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 425     }
 426     elsif ( $q_dim < $s_dim )
 427     {
 428         $beg_narrow_dist = $match->[ S_BEG ] - $s_min;
 429         $end_narrow_dist = $s_max - $match->[ S_BEG ];
 430
 431         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 432     }
 433     else
 434     {
 435         $max_narrow_dist = 0;
 436     }
 437
 438     $score_narrow = $max_narrow_dist;
 439
 440     $score_total = $score_len - $score_narrow - $score_diag;
 441
 442     return $score_total;
 443 }
 444
 445
 446 sub score_match_niels
 447 {
 448     # Niels Larsen, June 2004.
 449
 450     # Creates a crude "heuristic" attempt of telling how likely it is that a
 451     # given match occurs by chance in a given search space. If sequences are
 452     # given their composition is taken into account. The scoring punishes
 453     # distance from diagonal(s) and distance from previous match(es). Scores
 454     # range from zero and up, and lower is better.
 455
 456     my ( $match,   # Match array
 457          $q_seq,   # Either q_seq or s_seq
 458          $q_min,   # Lower bound search area (query sequence)
 459          $q_max,   # Upper bound search area (query sequence)
 460          $s_min,   # Lower bound search area (subject sequence)
 461          $s_max,   # Upper bound search area (subject sequence)
 462          ) = @_;
 463
 464     # Returns a positive number.
 465
 466     my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos,
 467          $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i,
 468          $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch,
 469          $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta,
 470          $delta_max );
 471
 472     $q_beg = $match->[Q_BEG];
 473     $q_end = $match->[Q_END];
 474     $s_beg = $match->[S_BEG];
 475     $s_end = $match->[S_END];
 476
 477     # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<<
 478
 479     # Get the probability of a match from the sequence composition (when
 480     # match is longer than 20 and sequence is given, otherwise use 0.25)
 481     # and raise that to the power of the length.
 482
 483     if ( $match->[LEN] > 20 and defined $q_seq )
 484     {
 485         $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1;
 486         $seqlen = length $seq;
 487
 488         $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0;
 489
 490         for ( $i = 0; $i < $seqlen; $i++ )
 491         {
 492             $chars{ substr $seq, $i, 1 }++;
 493         }
 494
 495         $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2
 496                 + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2;
 497     }
 498     else {
 499         $pmatch = 0.25;
 500     }
 501
 502     $score = $pmatch ** ( $q_end - $q_beg + 1 );
 503
 504 #     # Punish by difference in height and width of search space,
 505
 506 #     $q_dim = $q_max - $q_min + 1;
 507 #     $s_dim = $s_max - $s_min + 1;
 508
 509 #     if ( $q_dim != $s_dim ) {
 510 #         $score *= abs ( $q_dim - $s_dim ) ** 2;
 511 #     }
 512
 513     return $score if $score > 0.25;
 514
 515     # Punish by how far the match is to the closest corner of the search
 516     # space,
 517
 518     $q_delta_beg = $q_beg - $q_min;
 519     $s_delta_beg = $s_beg - $s_min;
 520
 521     $q_delta_end = $q_max - $q_end;
 522     $s_delta_end = $s_max - $s_end;
 523
 524     if ( $q_delta_beg > $s_delta_beg ) {
 525         $delta_beg_max = $q_delta_beg;
 526     } else {
 527         $delta_beg_max = $s_delta_beg;
 528     }
 529
 530     if ( $q_delta_end > $s_delta_end ) {
 531         $delta_end_max = $q_delta_end;
 532     } else {
 533         $delta_end_max = $s_delta_end;
 534     }
 535
 536     if ( $delta_beg_max <= $delta_end_max ) {
 537         $score *= ($delta_beg_max+1) ** 2.0;
 538     } else {
 539         $score *= ($delta_end_max+1) ** 2.0;
 540     }
 541
 542     return $score if $score > 0.25;
 543
 544     # Add penalty if the match is towards the narrow end of the
 545     # search space,
 546
 547     if ( ($q_max - $q_min) <= ($s_max - $s_min) )
 548     {
 549         if ( $q_delta_beg > $s_delta_beg )
 550         {
 551             $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3;
 552         }
 553         elsif ( $q_delta_end > $s_delta_end )
 554         {
 555             $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3;
 556         }
 557     }
 558     else
 559     {
 560         if ( $s_delta_beg > $q_delta_beg )
 561         {
 562             $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3;
 563         }
 564         elsif ( $s_delta_end > $q_delta_end )
 565         {
 566             $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3;
 567         }
 568     }
 569
 570     if ( $score < 0 ) {
 571         print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n";
 572         die qq (Score <= 0 -> $score);
 573     }
 574
 575     return $score;
 576 }
 577
 578
 579 sub insert_indels
 580 {
 581     # Martin A. Hansen, June 2009.
 582
 583     # Given a list of matches and references to q_seq and s_seq,
 584     # insert indels to form aligned sequences.
 585
 586     my ( $matches,   # list of matches
 587          $q_seq,     # ref to query sequence
 588          $s_seq,     # ref to subject sequence
 589        ) = @_;
 590
 591     # Returns nothing.
 592
 593     my ( $q_indels, $s_indels, $match, $diff, $first );
 594
 595     @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
 596
 597     $first    = 1;
 598     $q_indels = 0;
 599     $s_indels = 0;
 600
 601     # ---- initial indels ----
 602
 603     $match = shift @{ $matches };
 604
 605     substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
 606     substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
 607
 608     $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
 609
 610     if ( $diff < 0 )
 611     {
 612         substr ${ $q_seq }, 0, 0, '-' x abs $diff;
 613
 614         $q_indels += abs $diff;
 615     }
 616     elsif ( $diff > 0 )
 617     {
 618         substr ${ $s_seq }, 0, 0, '-' x abs $diff;
 619
 620         $s_indels += abs $diff;
 621     }
 622
 623     # ---- internal indels ----
 624
 625     foreach $match ( @{ $matches } )
 626     {
 627         substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
 628         substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
 629
 630         $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
 631
 632         if ( $diff < 0 )
 633         {
 634             substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff;
 635
 636             $q_indels += abs $diff;
 637         }
 638         elsif ( $diff > 0 )
 639         {
 640             substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff;
 641
 642             $s_indels += abs $diff;
 643         }
 644     }
 645
 646     # ---- last indels ----
 647
 648     $diff = length( ${ $s_seq } ) - length( ${ $q_seq } );
 649
 650     if ( $diff > 0 ) {
 651          ${ $q_seq } .= '-' x abs $diff;
 652     } else {
 653          ${ $s_seq } .= '-' x abs $diff;
 654     }
 655 }
 656
 657
 658 sub align_sim_local
 659 {
 660     # Martin A. Hansen, May 2007.
 661
 662     # Calculate the local similarity of an alignment based on
 663     # an alignment chain. The similarity is calculated as
 664     # the number of matching residues divided by the overall
 665     # length of the alignment chain. This means that a short
 666     # but "good" alignment will yield a high similarity, while
 667     # a long "poor" alignment will yeild a low similarity.
 668
 669     my ( $chain,   # list of matches in alignment
 670        ) = @_;
 671
 672     # returns a float
 673
 674     my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
 675
 676     return 0 if not @{ $chain };
 677
 678     $match_tot = 0;
 679     $q_end     = 0;
 680     $s_end     = 0;
 681     $q_beg     = 999999999;
 682     $s_beg     = 999999999;
 683
 684     foreach $match ( @{ $chain } )
 685     {
 686         $match_tot += $match->[ LEN ];
 687
 688         $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
 689         $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
 690
 691         $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
 692         $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
 693     }
 694
 695     $q_diff = $q_end - $q_beg + 1;
 696     $s_diff = $s_end - $s_beg + 1;
 697
 698     $max = Maasha::Calc::max( $q_diff, $s_diff );
 699
 700     $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
 701
 702     return $sim;
 703 }
 704
 705
 706 sub align_sim_global
 707 {
 708     # Martin A. Hansen, June 2007.
 709
 710     # Calculate the global similarity of an alignment based on
 711     # an alignment chain. The similarity is calculated as
 712     # the number of matching residues divided by the
 713     # length of the shortest sequence.
 714
 715     my ( $chain,   # list of matches in alignment
 716          $q_seq,   # ref to query sequence
 717          $s_seq,   # ref to subject sequence
 718        ) = @_;
 719
 720     # returns a float
 721
 722     my ( $match_tot, $min, $sim );
 723
 724     return 0 if not @{ $chain };
 725
 726     $match_tot = 0;
 727
 728     map { $match_tot += $_->[ LEN ] } @{ $chain };
 729
 730     $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
 731
 732     $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
 733
 734     return $sim;
 735 }
 736
 737
 738 sub align_consensus
 739 {
 740     # Martin A. Hansen, June 2006.
 741
 742     # Given an alignment as a list of FASTA entries,
 743     # generates a consensus sequences based on the
 744     # entropies for each column similar to the way
 745     # a sequence logo i calculated. Returns the
 746     # consensus sequence as a FASTA entry.
 747
 748     my ( $entries,   # list of aligned FASTA entries
 749          $type,      # residue type - OPTIONAL
 750          $min_sim,   # minimum similarity - OPTIONAL
 751        ) = @_;
 752
 753     # Returns tuple
 754
 755     my ( $bit_max, $data, $pos, $char, $score, $entry );
 756
 757     $type    ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
 758     $min_sim ||= 50;
 759
 760     if ( $type =~ /protein/ ) {
 761         $bit_max = 4;
 762     } else {
 763         $bit_max = 2;
 764     }
 765
 766     $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
 767
 768     foreach $pos ( @{ $data } )
 769     {
 770         ( $char, $score ) = @{ $pos->[ -1 ] };
 771
 772         if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
 773             $entry->[ SEQ ] .= $char;
 774         } else {
 775             $entry->[ SEQ ] .= "-";
 776         }
 777     }
 778
 779     $entry->[ HEAD ] = "Consensus: $min_sim%";
 780
 781     return wantarray ? @{ $entry } : $entry;
 782 }
 783
 784
 785 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<