code_perl/Maasha/AlignTwoSeq.pm

   1 package Maasha::AlignTwoSeq;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # yak yak yak
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use Data::Dumper;
  33 use Storable qw( dclone );
  34 use Maasha::Calc;
  35 use Maasha::Seq;
  36 use vars qw ( @ISA @EXPORT );
  37
  38 use constant {
  39     Q_BEG    => 0,
  40     Q_END    => 1,
  41     S_BEG    => 2,
  42     S_END    => 3,
  43     LEN      => 4,
  44     SCORE    => 5,
  45     HEAD     => 0,
  46     SEQ      => 1,
  47     ALPH_LEN => 4,
  48 };
  49
  50 @ISA = qw( Exporter );
  51
  52
  53 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  54
  55
  56 sub align_two_seq
  57 {
  58     # Martin A. Hansen, August 2006.
  59
  60     # Generates an alignment by chaining matches, which are subsequences
  61     # shared between two sequences. The routine functions by considering
  62     # only matches within a given search space. If no matches are given
  63     # these will be generated, if long matches are found these will be
  64     # included in the alignment, otherwise matches will be included depending
  65     # on a calculated score. New search spaces spanning the spaces between
  66     # matches and the search space boundaries will be cast and recursed into.
  67
  68     my ( $q_seq,     # sequence 1 ref
  69          $s_seq,     # sequence 2 ref
  70          $matches,   # list of matches
  71          $q_min,     # q sequence start position
  72          $q_max,     # q sequecne stop  position
  73          $s_min,     # s sequence start position
  74          $s_max,     # s sequecne stop  position
  75        ) = @_;
  76
  77     # returns a chain of matches that can be chained into an alignment
  78
  79     $matches ||= [];
  80     $q_min   ||= 0;
  81     $s_min   ||= 0;
  82     $q_max   ||= length( ${ $q_seq } ) - 1;
  83     $s_max   ||= length( ${ $s_seq } ) - 1;
  84
  85     my ( $wordsize, @chain, $match, $best_match, @long_matches );
  86
  87     $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
  88
  89     if ( scalar @{ $matches } == 0 )   # no matches - find some!
  90     {
  91         $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max );
  92         $matches  = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
  93
  94         while ( scalar @{ $matches } == 0 and $wordsize > 1 )
  95         {
  96             $wordsize--;
  97             $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
  98         }
  99
 100         if ( scalar @{ $matches } > 0 ) {
 101             push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max );
 102         }
 103     }
 104     else   # matches are included according to score
 105     {
 106         foreach $match ( @{ $matches } ) {
 107             $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
 108         }
 109
 110         @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
 111         @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
 112
 113         $best_match   = shift @{ $matches };
 114
 115         if ( $best_match )
 116         {
 117             push @chain, $best_match;
 118
 119             if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
 120                 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1 );   # left search space
 121             }
 122
 123             if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
 124                 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max );   # right search space
 125             }
 126         }
 127     }
 128
 129     return wantarray ? @chain : \@chain;
 130 }
 131
 132
 133 sub select_matches
 134 {
 135     # Martin A. Hansen, August 2006.
 136
 137     # Given a list of matches and a search space,
 138     # include only those matches contained within
 139     # this search space.
 140
 141     my ( $matches,   # list of matches
 142          $q_min,     # q sequence start position
 143          $q_max,     # q sequecne stop  position
 144          $s_min,     # s sequence start position
 145          $s_max,     # s sequecne stop  position
 146        ) = @_;
 147
 148     # returns list of matches
 149
 150     my ( @matches );
 151
 152     @matches = grep { $_->[ Q_BEG ] >= $q_min and
 153                       $_->[ S_BEG ] >= $s_min and
 154                       $_->[ Q_END ] <= $q_max and
 155                       $_->[ S_END ] <= $s_max } @{ $matches };
 156
 157     return wantarray ? @matches : \@matches;
 158 }
 159
 160
 161 sub find_wordsize
 162 {
 163     # Martin A. Hansen, August 2006.
 164
 165     # Given a search space calculates the wordsize for a word so a match
 166     # occurs only a few times. More matches may be needed at low similarity in
 167     # order to avoid starting with a wrong match.
 168
 169     my ( $q_min,   # q sequence start position
 170          $q_max,   # q sequecne stop position
 171          $s_min,   # s sequence start position
 172          $s_max,   # s sequecne stop position
 173        ) = @_;
 174
 175     # returns integer
 176
 177     my ( $q_dim, $s_dim, $dim_min, $wordsize );
 178
 179     $q_dim = $q_max - $q_min + 1;
 180     $s_dim = $s_max - $s_min + 1;
 181
 182     $dim_min = Maasha::Calc::min( $q_dim, $s_dim );
 183
 184     $wordsize = 1;
 185
 186     if ( $dim_min > 2000000 )               # optimized for DNA
 187     {
 188         $wordsize = 50;
 189     }
 190     elsif ( $dim_min > 100000 )             # optimized for DNA
 191     {
 192         $wordsize = 25;
 193     }
 194     elsif ( $q_dim > 100 or $s_dim > 100 )  # optimized for DNA
 195     {
 196         while ( ALPH_LEN ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
 197             $wordsize++;
 198         }
 199     }
 200     else
 201     {
 202         while ( ALPH_LEN ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
 203             $wordsize++;
 204         }
 205     }
 206
 207     return $wordsize;
 208 }
 209
 210
 211 sub find_matches
 212 {
 213     # Martin A. Hansen, November 2006
 214
 215     # given two sequences, find all maximum expanded matches between these
 216
 217     my ( $q_seq,      # sequence 1
 218          $s_seq,      # sequence 2
 219          $wordsize,   # word size
 220          $q_min,      # q sequence start position
 221          $q_max,      # q sequecne stop  position
 222          $s_min,      # s sequence start position
 223          $s_max,      # s sequecne stop  position
 224        ) = @_;
 225
 226     # returns list of matches
 227
 228     my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches );
 229
 230     if ( length ${ $s_seq } > length ${ $q_seq } )
 231     {
 232         for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
 233         {
 234             $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
 235
 236             next if $q_word =~ /n/i; #   DNA/genome optimization
 237
 238             push @{ $word_hash{ $q_word } }, $q_beg;
 239         }
 240
 241         for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
 242         {
 243             $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
 244
 245             if ( exists $word_hash{ $s_word } )
 246             {
 247                 foreach $q_beg ( @{ $word_hash{ $s_word } } )
 248                 {
 249                     $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
 250
 251                     if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
 252                                 $match->[ Q_END ] <= $_->[ Q_END ] and
 253                                 $match->[ S_BEG ] >= $_->[ S_BEG ] and
 254                                 $match->[ S_END ] <= $_->[ S_END ] } @matches )
 255                     {
 256                         next;   # match is redundant
 257                     }
 258                     else
 259                     {
 260                         $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
 261                         $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
 262
 263                         push @matches, $match;
 264                     }
 265                 }
 266             }
 267         }
 268     }
 269     else
 270     {
 271         for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
 272         {
 273             $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
 274
 275             next if $s_word =~ /n/i; #   DNA/genome optimization
 276
 277             push @{ $word_hash{ $s_word } }, $s_beg;
 278         }
 279
 280         for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
 281         {
 282             $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
 283
 284             if ( exists $word_hash{ $q_word } )
 285             {
 286                 foreach $s_beg ( @{ $word_hash{ $q_word } } )
 287                 {
 288                     $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
 289
 290                     if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
 291                                 $match->[ Q_END ] <= $_->[ Q_END ] and
 292                                 $match->[ S_BEG ] >= $_->[ S_BEG ] and
 293                                 $match->[ S_END ] <= $_->[ S_END ] } @matches )
 294                     {
 295                         next;   # match is redundant
 296                     }
 297                     else
 298                     {
 299                         $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
 300                         $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
 301
 302                         push @matches, $match;
 303                     }
 304                 }
 305             }
 306         }
 307     }
 308
 309     return wantarray ? @matches : \@matches;
 310 }
 311
 312
 313 sub expand_match
 314 {
 315     # Martin A. Hansen, August 2006.
 316
 317     # Given two sequences and a match, expand the match maximally.
 318     # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ]
 319
 320     my ( $q_seq,   # sequence 1 ref
 321          $s_seq,   # sequence 2 ref
 322          $match,   # sequence match
 323          $q_max,   # q sequecne stop position
 324          $q_min,   # q sequence start position
 325          $s_max,   # s sequecne stop position
 326          $s_min,   # s sequence start position
 327        ) = @_;
 328
 329     # returns match
 330
 331     my ( $q_pos, $s_pos );
 332
 333     # expanding forward
 334
 335     $q_pos = $match->[ Q_END ] + 1;
 336     $s_pos = $match->[ S_END ] + 1;
 337
 338     while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
 339     {
 340         $q_pos++;
 341         $s_pos++;
 342     }
 343
 344     $match->[ Q_END ] = $q_pos - 1;
 345     $match->[ S_END ] = $s_pos - 1;
 346
 347     # expanding backwards
 348
 349     $q_pos = $match->[ Q_BEG ] - 1;
 350     $s_pos = $match->[ S_BEG ] - 1;
 351
 352     while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
 353     {
 354         $q_pos--;
 355         $s_pos--;
 356     }
 357
 358     $match->[ Q_BEG ] = $q_pos + 1;
 359     $match->[ S_BEG ] = $s_pos + 1;
 360
 361     return $match;
 362 }
 363
 364
 365 sub score_match
 366 {
 367     # Martin A. Hansen, August 2006
 368
 369     # given a match and a search space scores the match according to three criteria:
 370
 371     # 1) length of match - the longer the better.
 372     # 2) distance to closest corner - the shorter the better.
 373     # 3) distance to closest narrow end of the search space - the shorter the better.
 374
 375     # each of these scores are divided by search space dimentions, and the total score
 376     # is calculated: total = score_len - score_corner - score_narrow
 377
 378     # the higher the score, the better the match.
 379
 380     my ( $match,   # match
 381          $q_min,   # q sequence start position
 382          $q_max,   # q sequecne stop  position
 383          $s_min,   # s sequence start position
 384          $s_max,   # s sequecne stop  position
 385        ) = @_;
 386
 387     # returns a positive number
 388
 389     my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist,
 390          $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist );
 391
 392     # ----- 1) scoring according to match length
 393
 394     $score_len = $match->[ LEN ] ** 3;
 395
 396     # ---- 2) score according to distance away from diagonal
 397
 398     $q_dim = $q_max - $q_min + 1;
 399     $s_dim = $s_max - $s_min + 1;
 400
 401     if ( $q_dim >= $s_dim ) # s_dim is the narrow end
 402     {
 403         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim );
 404         $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max );
 405     }
 406     else
 407     {
 408         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim );
 409         $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max );
 410     }
 411
 412     $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
 413
 414     $score_diag = 2 * $min_diag_dist ** 2;
 415
 416     # ----- 3) scoring according to distance to the narrow end of the search space
 417
 418     if ( $q_dim > $s_dim ) # s_dim is the narrow end
 419     {
 420         $beg_narrow_dist = $match->[ Q_BEG ] - $q_min;
 421         $end_narrow_dist = $q_max - $match->[ Q_BEG ];
 422
 423         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 424     }
 425     elsif ( $q_dim < $s_dim )
 426     {
 427         $beg_narrow_dist = $match->[ S_BEG ] - $s_min;
 428         $end_narrow_dist = $s_max - $match->[ S_BEG ];
 429
 430         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 431     }
 432     else
 433     {
 434         $max_narrow_dist = 0;
 435     }
 436
 437     $score_narrow = $max_narrow_dist;
 438
 439     $score_total = $score_len - $score_narrow - $score_diag;
 440
 441     return $score_total;
 442 }
 443
 444
 445 sub score_match_niels
 446 {
 447     # Niels Larsen, June 2004.
 448
 449     # Creates a crude "heuristic" attempt of telling how likely it is that a
 450     # given match occurs by chance in a given search space. If sequences are
 451     # given their composition is taken into account. The scoring punishes
 452     # distance from diagonal(s) and distance from previous match(es). Scores
 453     # range from zero and up, and lower is better.
 454
 455     my ( $match,   # Match array
 456          $q_seq,   # Either q_seq or s_seq
 457          $q_min,   # Lower bound search area (query sequence)
 458          $q_max,   # Upper bound search area (query sequence)
 459          $s_min,   # Lower bound search area (subject sequence)
 460          $s_max,   # Upper bound search area (subject sequence)
 461          ) = @_;
 462
 463     # Returns a positive number.
 464
 465     my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos,
 466          $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i,
 467          $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch,
 468          $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta,
 469          $delta_max );
 470
 471     $q_beg = $match->[Q_BEG];
 472     $q_end = $match->[Q_END];
 473     $s_beg = $match->[S_BEG];
 474     $s_end = $match->[S_END];
 475
 476     # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<<
 477
 478     # Get the probability of a match from the sequence composition (when
 479     # match is longer than 20 and sequence is given, otherwise use 0.25)
 480     # and raise that to the power of the length.
 481
 482     if ( $match->[LEN] > 20 and defined $q_seq )
 483     {
 484         $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1;
 485         $seqlen = length $seq;
 486
 487         $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0;
 488
 489         for ( $i = 0; $i < $seqlen; $i++ )
 490         {
 491             $chars{ substr $seq, $i, 1 }++;
 492         }
 493
 494         $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2
 495                 + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2;
 496     }
 497     else {
 498         $pmatch = 0.25;
 499     }
 500
 501     $score = $pmatch ** ( $q_end - $q_beg + 1 );
 502
 503 #     # Punish by difference in height and width of search space,
 504
 505 #     $q_dim = $q_max - $q_min + 1;
 506 #     $s_dim = $s_max - $s_min + 1;
 507
 508 #     if ( $q_dim != $s_dim ) {
 509 #         $score *= abs ( $q_dim - $s_dim ) ** 2;
 510 #     }
 511
 512     return $score if $score > 0.25;
 513
 514     # Punish by how far the match is to the closest corner of the search
 515     # space,
 516
 517     $q_delta_beg = $q_beg - $q_min;
 518     $s_delta_beg = $s_beg - $s_min;
 519
 520     $q_delta_end = $q_max - $q_end;
 521     $s_delta_end = $s_max - $s_end;
 522
 523     if ( $q_delta_beg > $s_delta_beg ) {
 524         $delta_beg_max = $q_delta_beg;
 525     } else {
 526         $delta_beg_max = $s_delta_beg;
 527     }
 528
 529     if ( $q_delta_end > $s_delta_end ) {
 530         $delta_end_max = $q_delta_end;
 531     } else {
 532         $delta_end_max = $s_delta_end;
 533     }
 534
 535     if ( $delta_beg_max <= $delta_end_max ) {
 536         $score *= ($delta_beg_max+1) ** 2.0;
 537     } else {
 538         $score *= ($delta_end_max+1) ** 2.0;
 539     }
 540
 541     return $score if $score > 0.25;
 542
 543     # Add penalty if the match is towards the narrow end of the
 544     # search space,
 545
 546     if ( ($q_max - $q_min) <= ($s_max - $s_min) )
 547     {
 548         if ( $q_delta_beg > $s_delta_beg )
 549         {
 550             $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3;
 551         }
 552         elsif ( $q_delta_end > $s_delta_end )
 553         {
 554             $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3;
 555         }
 556     }
 557     else
 558     {
 559         if ( $s_delta_beg > $q_delta_beg )
 560         {
 561             $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3;
 562         }
 563         elsif ( $s_delta_end > $q_delta_end )
 564         {
 565             $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3;
 566         }
 567     }
 568
 569     if ( $score < 0 ) {
 570         print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n";
 571         die qq (Score <= 0 -> $score);
 572     }
 573
 574     return $score;
 575 }
 576
 577
 578 sub insert_indels
 579 {
 580     # Martin A. Hansen, June 2009.
 581
 582     # Given a list of matches and references to q_seq and s_seq,
 583     # insert indels to form aligned sequences.
 584
 585     my ( $matches,   # list of matches
 586          $q_seq,     # ref to query sequence
 587          $s_seq,     # ref to subject sequence
 588        ) = @_;
 589
 590     # Returns nothing.
 591
 592     my ( $q_indels, $s_indels, $match, $diff, $first );
 593
 594     @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
 595
 596     $first    = 1;
 597     $q_indels = 0;
 598     $s_indels = 0;
 599
 600     # ---- initial indels ----
 601
 602     $match = shift @{ $matches };
 603
 604     substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
 605     substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
 606
 607     $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
 608
 609     if ( $diff < 0 )
 610     {
 611         substr ${ $q_seq }, 0, 0, '-' x abs $diff;
 612
 613         $q_indels += abs $diff;
 614     }
 615     elsif ( $diff > 0 )
 616     {
 617         substr ${ $s_seq }, 0, 0, '-' x abs $diff;
 618
 619         $s_indels += abs $diff;
 620     }
 621
 622     # ---- internal indels ----
 623
 624     foreach $match ( @{ $matches } )
 625     {
 626         substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
 627         substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
 628
 629         $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
 630
 631         if ( $diff < 0 )
 632         {
 633             substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff;
 634
 635             $q_indels += abs $diff;
 636         }
 637         elsif ( $diff > 0 )
 638         {
 639             substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff;
 640
 641             $s_indels += abs $diff;
 642         }
 643     }
 644
 645     # ---- last indels ----
 646
 647     $diff = length( ${ $s_seq } ) - length( ${ $q_seq } );
 648
 649     if ( $diff > 0 ) {
 650          ${ $q_seq } .= '-' x abs $diff;
 651     } else {
 652          ${ $s_seq } .= '-' x abs $diff;
 653     }
 654 }
 655
 656
 657 sub align_sim_local
 658 {
 659     # Martin A. Hansen, May 2007.
 660
 661     # Calculate the local similarity of an alignment based on
 662     # an alignment chain. The similarity is calculated as
 663     # the number of matching residues divided by the overall
 664     # length of the alignment chain. This means that a short
 665     # but "good" alignment will yield a high similarity, while
 666     # a long "poor" alignment will yeild a low similarity.
 667
 668     my ( $chain,   # list of matches in alignment
 669        ) = @_;
 670
 671     # returns a float
 672
 673     my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
 674
 675     return 0 if not @{ $chain };
 676
 677     $match_tot = 0;
 678     $q_end     = 0;
 679     $s_end     = 0;
 680     $q_beg     = 999999999;
 681     $s_beg     = 999999999;
 682
 683     foreach $match ( @{ $chain } )
 684     {
 685         $match_tot += $match->[ LEN ];
 686
 687         $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
 688         $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
 689
 690         $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
 691         $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
 692     }
 693
 694     $q_diff = $q_end - $q_beg + 1;
 695     $s_diff = $s_end - $s_beg + 1;
 696
 697     $max = Maasha::Calc::max( $q_diff, $s_diff );
 698
 699     $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
 700
 701     return $sim;
 702 }
 703
 704
 705 sub align_sim_global
 706 {
 707     # Martin A. Hansen, June 2007.
 708
 709     # Calculate the global similarity of an alignment based on
 710     # an alignment chain. The similarity is calculated as
 711     # the number of matching residues divided by the
 712     # length of the shortest sequence.
 713
 714     my ( $chain,   # list of matches in alignment
 715          $q_seq,   # ref to query sequence
 716          $s_seq,   # ref to subject sequence
 717        ) = @_;
 718
 719     # returns a float
 720
 721     my ( $match_tot, $min, $sim );
 722
 723     return 0 if not @{ $chain };
 724
 725     $match_tot = 0;
 726
 727     map { $match_tot += $_->[ LEN ] } @{ $chain };
 728
 729     $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
 730
 731     $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
 732
 733     return $sim;
 734 }
 735
 736
 737 sub align_consensus
 738 {
 739     # Martin A. Hansen, June 2006.
 740
 741     # Given an alignment as a list of FASTA entries,
 742     # generates a consensus sequences based on the
 743     # entropies for each column similar to the way
 744     # a sequence logo i calculated. Returns the
 745     # consensus sequence as a FASTA entry.
 746
 747     my ( $entries,   # list of aligned FASTA entries
 748          $type,      # residue type - OPTIONAL
 749          $min_sim,   # minimum similarity - OPTIONAL
 750        ) = @_;
 751
 752     # Returns tuple
 753
 754     my ( $bit_max, $data, $pos, $char, $score, $entry );
 755
 756     $type    ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
 757     $min_sim ||= 50;
 758
 759     if ( $type =~ /protein/ ) {
 760         $bit_max = 4;
 761     } else {
 762         $bit_max = 2;
 763     }
 764
 765     $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
 766
 767     foreach $pos ( @{ $data } )
 768     {
 769         ( $char, $score ) = @{ $pos->[ -1 ] };
 770
 771         if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
 772             $entry->[ SEQ ] .= $char;
 773         } else {
 774             $entry->[ SEQ ] .= "-";
 775         }
 776     }
 777
 778     $entry->[ HEAD ] = "Consensus: $min_sim%";
 779
 780     return wantarray ? @{ $entry } : $entry;
 781 }
 782
 783
 784 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<