code_perl/Maasha/AlignTwoSeq.pm

   1 package Maasha::AlignTwoSeq;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Routines to create a pair-wise alignment.
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use warnings;
  32 use strict;
  33 use Data::Dumper;
  34 use Maasha::Calc;
  35 use Maasha::Seq;
  36 use vars qw ( @ISA @EXPORT );
  37
  38 @ISA = qw( Exporter );
  39
  40 use constant {
  41     SCORE_FACTOR_LEN    => 3,
  42     SCORE_FACTOR_CORNER => 2,
  43     SCORE_FACTOR_DIAG   => 1,
  44     SCORE_FACTOR_NARROW => 0.5,
  45     VERBOSE             => 0,
  46 };
  47
  48
  49 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  50
  51
  52 sub align_two_seq
  53 {
  54     # Martin A. Hansen, August 2006.
  55
  56     # Generates an alignment by chaining matches, which are subsequences
  57     # shared between two sequences. The routine functions by considering
  58     # only matches within a given search space. If no matches are given
  59     # these will be located and matches will be included depending on a
  60     # calculated score. New search spaces spanning the spaces between
  61     # matches and the search space boundaries will be cast and recursed
  62     # into.
  63
  64     # Usage: align_two_seq( { Q_SEQ => \$q_seq, S_SEQ => \$s_seq }, [] );
  65
  66     my ( $space,     # search space
  67          $matches,   # list of matches
  68        ) = @_;
  69
  70     # Returns a list.
  71
  72     my ( @chain, $best_match, $left_space, $right_space );
  73
  74     new_space( $space );
  75
  76     matches_select( $matches, $space );
  77
  78     if ( scalar @{ $matches } == 0 )   # no matches - find some!
  79     {
  80         $matches = matches_find( $space );
  81
  82         if ( scalar @{ $matches } > 0 ) {
  83             push @chain, align_two_seq( $space, $matches );
  84         }
  85     }
  86     else   # matches are included according to score
  87     {
  88         $matches = matches_filter( $space, $matches );
  89
  90         $best_match = shift @{ $matches };
  91
  92         if ( $best_match )
  93         {
  94             push @chain, $best_match;
  95
  96             $left_space  = new_space_left( $best_match, $space );
  97             $right_space = new_space_right( $best_match, $space );
  98
  99             push @chain, align_two_seq( $left_space, $matches )  if defined $left_space;
 100             push @chain, align_two_seq( $right_space, $matches ) if defined $right_space;
 101         }
 102     }
 103
 104     return wantarray ? @chain : \@chain;
 105 }
 106
 107
 108 sub new_space
 109 {
 110     # Martin A. Hansen, August 2009.
 111
 112     # Define a new search space if not previously
 113     # defined. This occurs if align_two_seq() is
 114     # called initially with only the sequences.
 115
 116     my ( $space,   # search space
 117        ) = @_;
 118
 119     # Returns nothing.
 120
 121     if ( not defined $space->{ 'Q_MAX' } )
 122     {
 123         $space->{ 'Q_MIN' } = 0;
 124         $space->{ 'S_MIN' } = 0;
 125         $space->{ 'Q_MAX' } = length( ${ $space->{ 'Q_SEQ' } } ) - 1,
 126         $space->{ 'S_MAX' } = length( ${ $space->{ 'S_SEQ' } } ) - 1,
 127     }
 128 }
 129
 130
 131 sub new_space_left
 132 {
 133     # Martin A. Hansen, August 2009.
 134
 135     # Create a new search space between the beginning of
 136     # the current search space and the best match.
 137
 138     my ( $best_match,   # best match
 139          $space,        # search space
 140        ) = @_;
 141
 142     # Returns hashref.
 143
 144     my ( $new_space );
 145
 146     if ( $best_match->{ 'Q_BEG' } - $space->{ 'Q_MIN' } >= 2 and $best_match->{ 'S_BEG' } - $space->{ 'S_MIN' } >= 2 )
 147     {
 148         $new_space = {
 149             Q_SEQ => $space->{ 'Q_SEQ' },
 150             S_SEQ => $space->{ 'S_SEQ' },
 151             Q_MIN => $space->{ 'Q_MIN' },
 152             Q_MAX => $best_match->{ 'Q_BEG' } - 1,
 153             S_MIN => $space->{ 'S_MIN' },
 154             S_MAX => $best_match->{ 'S_BEG' } - 1,
 155         };
 156     }
 157
 158     return wantarray ? %{ $new_space } : $new_space;
 159 }
 160
 161
 162 sub new_space_right
 163 {
 164     # Martin A. Hansen, August 2009.
 165
 166     # Create a new search space between the best match
 167     # and the end of the current search space.
 168
 169     my ( $best_match,   # best match
 170          $space,        # search space
 171        ) = @_;
 172
 173     # Returns hashref.
 174
 175     my ( $new_space );
 176
 177     if ( $space->{ 'Q_MAX' } - $best_match->{ 'Q_END' } >= 2 and $space->{ 'S_MAX' } - $best_match->{ 'S_END' } >= 2 )
 178     {
 179         $new_space = {
 180             Q_SEQ => $space->{ 'Q_SEQ' },
 181             S_SEQ => $space->{ 'S_SEQ' },
 182             Q_MIN => $best_match->{ 'Q_END' } + 1,
 183             Q_MAX => $space->{ 'Q_MAX' },
 184             S_MIN => $best_match->{ 'S_END' } + 1,
 185             S_MAX => $space->{ 'S_MAX' },
 186         };
 187     }
 188
 189     return wantarray ? %{ $new_space } : $new_space;
 190 }
 191
 192
 193 sub matches_select
 194 {
 195     # Martin A. Hansen, August 2006.
 196
 197     # Given a list of matches and a search space,
 198     # include only those matches contained within
 199     # this search space.
 200
 201     my ( $matches,   # list of matches
 202          $space,     # search space
 203        ) = @_;
 204
 205     # Returns nothing.
 206
 207     @{ $matches } = grep { $_->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
 208                            $_->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
 209                            $_->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
 210                            $_->{ 'S_END' } <= $space->{ 'S_MAX' } } @{ $matches };
 211 }
 212
 213
 214 sub word_size_calc
 215 {
 216     # Martin A. Hansen, August 2008.
 217
 218     # Given a search space calculates the wordsize for a word so a match
 219     # occurs only a few times. More matches may be needed at low similarity in
 220     # order to avoid starting with a wrong match.
 221
 222     my ( $space,   # search space
 223        ) = @_;
 224
 225     # Returns integer.
 226
 227     my ( $factor, $word_size );
 228
 229     $factor = 0.05;
 230
 231     if ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } < $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) {
 232         $word_size = int( $factor * ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } ) ) + 1;
 233     } else {
 234         $word_size = int( $factor * ( $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) ) + 1;
 235     }
 236
 237     return $word_size;
 238 }
 239
 240
 241 sub seq_index
 242 {
 243     # Martin A. Hansen, August 2009.
 244
 245     # Indexes a query sequence in a specified search space
 246     # Overlapping words are saved as keys in an index hash,
 247     # and a list of word postions are saved as value.
 248
 249     my ( $space,       # search space
 250          $word_size,   # word size
 251        ) = @_;
 252
 253     # Returns a hashref.
 254
 255     my ( $i, $word, %index );
 256
 257     for ( $i = $space->{ 'Q_MIN' }; $i <= $space->{ 'Q_MAX' } - $word_size + 1; $i++ )
 258     {
 259         $word = uc substr ${ $space->{ 'Q_SEQ' } }, $i, $word_size;
 260
 261         next if $word =~ tr/N//;   # Skip sequences with Ns.
 262
 263         push @{ $index{ $word } }, $i;
 264     }
 265
 266     return wantarray ? %index : \%index;
 267 }
 268
 269
 270 sub seq_scan
 271 {
 272     # Martin A. Hansen, August 2009.
 273
 274     my ( $index,       # sequence index
 275          $space,       # search space
 276          $word_size,   # word size
 277        ) = @_;
 278
 279     # Returns a list.
 280
 281     my ( $i, $word, $pos, $match, @matches, $redundant );
 282
 283     $redundant = {};
 284
 285     for ( $i = $space->{ 'S_MIN' }; $i <= $space->{ 'S_MAX' } - $word_size + 1; $i++ )
 286     {
 287         $word = uc substr ${ $space->{ 'S_SEQ' } }, $i, $word_size;
 288
 289         if ( exists $index->{ $word } )
 290         {
 291             foreach $pos ( @{ $index->{ $word } } )
 292             {
 293                 $match = {
 294                     'Q_BEG' => $pos,
 295                     'Q_END' => $pos + $word_size - 1,
 296                     'S_BEG' => $i,
 297                     'S_END' => $i + $word_size - 1,
 298                     'LEN'   => $word_size,
 299                     'SCORE' => 0,
 300                 };
 301
 302                 if ( not match_redundant( $match, $redundant ) )
 303                 {
 304                     match_expand( $match, $space );
 305
 306                     push @matches, $match;
 307
 308                     match_redundant_add( $match, $redundant );
 309                 }
 310             }
 311         }
 312     }
 313
 314     return wantarray ? @matches : \@matches;
 315 }
 316
 317
 318 sub matches_find
 319 {
 320     # Martin A. Hansen, November 2006
 321
 322     # Given two sequences, find all maximum expanded matches between these.
 323
 324     my ( $space,   # search space
 325        ) = @_;
 326
 327     # returns list of matches
 328
 329     my ( $word_size, $index, $matches );
 330
 331     $word_size = word_size_calc( $space );
 332
 333     $matches = [];
 334
 335     while ( scalar @{ $matches } == 0 and $word_size > 0 )
 336     {
 337         $index   = seq_index( $space, $word_size );
 338         $matches = seq_scan( $index, $space, $word_size );
 339
 340         $word_size--;
 341     }
 342
 343     return wantarray ? @{ $matches } : $matches;
 344 }
 345
 346
 347 sub match_expand
 348 {
 349     # Martin A. Hansen, August 2009.
 350
 351     # Given a match and a search space hash,
 352     # expand the match maximally both forward and
 353     # backward direction.
 354
 355     my ( $match,   # match to expand
 356          $space,   # search space
 357        ) = @_;
 358
 359     # Returns nothing.
 360
 361     match_expand_forward( $match, $space );
 362     match_expand_backward( $match, $space );
 363 }
 364
 365
 366 sub match_expand_forward
 367 {
 368     # Martin A. Hansen, August 2009.
 369
 370     # Given a match and a search space hash,
 371     # expand the match maximally in the forward
 372     # direction.
 373
 374     my ( $match,   # match to expand
 375          $space,   # search space
 376        ) = @_;
 377
 378     # Returns nothing.
 379
 380     while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
 381             $match->{ 'S_END' } <= $space->{ 'S_MAX' } and
 382             uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
 383     {
 384         $match->{ 'Q_END' }++;
 385         $match->{ 'S_END' }++;
 386         $match->{ 'LEN' }++;
 387     }
 388
 389     $match->{ 'Q_END' }--;
 390     $match->{ 'S_END' }--;
 391     $match->{ 'LEN' }--;
 392 }
 393
 394
 395 sub match_expand_backward
 396 {
 397     # Martin A. Hansen, August 2009.
 398
 399     # Given a match and a search space hash,
 400     # expand the match maximally in the backward
 401     # direction.
 402
 403     my ( $match,   # match to expand
 404          $space,   # search space
 405        ) = @_;
 406
 407     # Returns nothing.
 408
 409     while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
 410             $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
 411             uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
 412     {
 413         $match->{ 'Q_BEG'}--;
 414         $match->{ 'S_BEG'}--;
 415         $match->{ 'LEN' }++;
 416     }
 417
 418     $match->{ 'Q_BEG'}++;
 419     $match->{ 'S_BEG'}++;
 420     $match->{ 'LEN' }--;
 421 }
 422
 423
 424 sub match_redundant
 425 {
 426     # Martin A. Hansen, August 2009
 427
 428     my ( $new_match,   # match
 429          $redundant,   # hashref
 430        ) = @_;
 431
 432     # Returns boolean.
 433
 434     my ( $match );
 435
 436     foreach $match ( @{ $redundant->{ $new_match->{ 'Q_BEG' } } } )
 437     {
 438         if ( $new_match->{ 'S_BEG' } >= $match->{ 'S_BEG' } and $new_match->{ 'S_END' } <= $match->{ 'S_END' } ) {
 439             return 1;
 440         }
 441     }
 442
 443     return 0;
 444 }
 445
 446
 447 sub match_redundant_add
 448 {
 449     # Martin A. Hansen, August 2009
 450
 451     my ( $match,       # match
 452          $redundant,   # hash ref
 453        ) = @_;
 454
 455     # Returns nothing.
 456
 457     map { push @{ $redundant->{ $_ } }, $match } ( $match->{ 'Q_BEG' } .. $match->{ 'Q_END' } );
 458 }
 459
 460
 461 sub matches_filter
 462 {
 463     # Martin A. Hansen, August 2009.
 464
 465     my ( $space,     # search space
 466          $matches,   # list of matches
 467        ) = @_;
 468
 469     # Returns a list.
 470
 471     my ( $best_match, $match, @new_matches );
 472
 473     $best_match = shift @{ $matches };
 474
 475     match_score( $best_match, $space );
 476
 477     foreach $match ( @{ $matches } )
 478     {
 479         match_score( $match, $space );
 480
 481         if ( $match->{ 'SCORE' } > 0 )
 482         {
 483             if ( $match->{ 'SCORE' } > $best_match->{ 'SCORE' } ) {
 484                 $best_match = $match;
 485             } else {
 486                 push @new_matches, $match;
 487             }
 488         }
 489     }
 490
 491     unshift @new_matches, $best_match if $best_match->{ 'SCORE' } > 0;
 492
 493     return wantarray ? @new_matches : \@new_matches;
 494 }
 495
 496
 497 sub match_score
 498 {
 499     # Martin A. Hansen, August 2009
 500
 501     # Given a match and a search space scores the match according to three criteria:
 502
 503     # 1) length of match - the longer the better.
 504     # 2) distance to closest corner - the shorter the better.
 505     # 3) distance to closest narrow end of the search space - the shorter the better.
 506
 507     # Each of these scores are divided by search space dimensions, and the total score
 508     # is calculated: total = score_len - score_corner - score_narrow.
 509
 510     # The higher the score, the better the match.
 511
 512     my ( $match,   # match
 513          $space,   # search space
 514        ) = @_;
 515
 516     # Returns nothing.
 517
 518     my ( $score_len, $score_corner, $score_diag, $score_narrow, $score_total );
 519
 520     $score_len    = match_score_len( $match );
 521     $score_corner = match_score_corner( $match, $space );
 522     $score_diag   = match_score_diag( $match, $space );
 523     $score_narrow = match_score_narrow( $match, $space );
 524
 525     $score_total  = $score_len - $score_corner - $score_diag - $score_narrow;
 526
 527     printf STDERR "LEN: %d   CORNER: %d   DIAG: %d   NARROW: %d   TOTAL: %d\n", $score_len, $score_corner, $score_diag, $score_narrow, $score_total if VERBOSE;
 528
 529     $match->{ 'SCORE' } = $score_total;
 530 }
 531
 532
 533 sub match_score_len
 534 {
 535     # Martin A. Hansen, August 2009
 536
 537     # Score according to match length.
 538
 539     my ( $match,   # match
 540        ) = @_;
 541
 542     # Returns integer.
 543
 544     return $match->{ 'LEN' } * SCORE_FACTOR_LEN;
 545 }
 546
 547
 548 sub match_score_corner
 549 {
 550     # Martin A. Hansen, August 2009
 551
 552     # Score according to distance from corner.
 553
 554     my ( $match,   # match
 555          $space,   # search space
 556        ) = @_;
 557
 558     # Returns integer.
 559
 560     my ( $left_dist, $right_dist, $score_corner );
 561
 562     $left_dist  = Maasha::Calc::dist_point2point( $space->{ 'Q_MIN' }, $space->{ 'S_MIN' }, $match->{ 'Q_BEG' } , $match->{ 'S_BEG' } );
 563     $right_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MAX' }, $space->{ 'S_MAX' }, $match->{ 'Q_END' } , $match->{ 'S_END' } );
 564
 565     $score_corner = Maasha::Calc::min( $left_dist, $right_dist );
 566
 567     return $score_corner * SCORE_FACTOR_CORNER;
 568 }
 569
 570
 571 sub match_score_diag
 572 {
 573     # Martin A. Hansen, August 2009
 574
 575     # Score according to distance from diagonal.
 576
 577     my ( $match,   # match
 578          $space,   # search space
 579        ) = @_;
 580
 581     # Returns integer.
 582
 583     my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag );
 584
 585     $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
 586     $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
 587
 588     if ( $q_dim >= $s_dim ) # s_dim is the narrow end
 589     {
 590         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
 591                                                         $match->{ 'S_BEG' },
 592                                                         $space->{ 'Q_MIN' },
 593                                                         $space->{ 'S_MIN' },
 594                                                         $space->{ 'Q_MIN' } + $s_dim,
 595                                                         $space->{ 'S_MIN' } + $s_dim );
 596
 597         $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
 598                                                         $match->{ 'S_BEG' },
 599                                                         $space->{ 'Q_MAX' } - $s_dim,
 600                                                         $space->{ 'S_MAX' } - $s_dim,
 601                                                         $space->{ 'Q_MAX' },
 602                                                         $space->{ 'S_MAX' } );
 603     }
 604     else
 605     {
 606         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
 607                                                         $match->{ 'S_BEG' },
 608                                                         $space->{ 'Q_MIN' },
 609                                                         $space->{ 'S_MIN' },
 610                                                         $space->{ 'Q_MIN' } + $q_dim,
 611                                                         $space->{ 'S_MIN' } + $q_dim );
 612
 613         $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
 614                                                         $match->{ 'S_BEG' },
 615                                                         $space->{ 'Q_MAX' } - $q_dim,
 616                                                         $space->{ 'S_MAX' } - $q_dim,
 617                                                         $space->{ 'Q_MAX' },
 618                                                         $space->{ 'S_MAX' } );
 619     }
 620
 621     $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
 622
 623     $score_diag = $min_diag_dist;
 624
 625     return $score_diag * SCORE_FACTOR_DIAG;
 626 }
 627
 628
 629 sub match_score_narrow
 630 {
 631     # Martin A. Hansen, August 2009
 632
 633     # Score according to distance to the narrow end of the search space.
 634
 635     my ( $match,   # match
 636          $space,   # search space
 637        ) = @_;
 638
 639     # Returns integer.
 640
 641     my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow );
 642
 643     $max_narrow_dist = 0;
 644
 645     $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
 646     $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
 647
 648     if ( $q_dim > $s_dim ) # s_dim is the narrow end
 649     {
 650         $beg_narrow_dist = $match->{ 'Q_BEG' } - $space->{ 'Q_MIN' };
 651         $end_narrow_dist = $space->{ 'Q_MAX' } - $match->{ 'Q_BEG' };
 652
 653         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 654     }
 655     elsif ( $q_dim < $s_dim ) # q_dim is the narrow end
 656     {
 657         $beg_narrow_dist = $match->{ 'S_BEG' } - $space->{ 'S_MIN' };
 658         $end_narrow_dist = $space->{ 'S_MAX' } - $match->{ 'S_BEG' };
 659
 660         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 661     }
 662
 663     $score_narrow = $max_narrow_dist;
 664
 665     return $score_narrow * SCORE_FACTOR_NARROW;
 666 }
 667
 668
 669 sub shift_case
 670 {
 671     # Martin A. Hansen, August 2009.
 672
 673     # Given a list of matches and sequence references
 674     # uppercase only matching sequnce while lowercasing the rest.
 675
 676     my ( $matches,   # list of matches
 677          $q_seq,     # query sequence ref
 678          $s_seq,     # subject sequence ref
 679        ) = @_;
 680
 681     # Returns nothing.
 682
 683     my ( $match );
 684
 685     ${ $q_seq } = lc ${ $q_seq };
 686     ${ $s_seq } = lc ${ $s_seq };
 687
 688     foreach $match ( @{ $matches } )
 689     {
 690         substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }, uc substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' };
 691         substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }, uc substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' };
 692     }
 693 }
 694
 695
 696
 697 sub insert_indels
 698 {
 699     # Martin A. Hansen, August 2009.
 700
 701     # Given a list of matches and sequence references
 702     # insert indels to form aligned sequences.
 703
 704     my ( $matches,   # list of matches
 705          $q_seq,     # query sequence ref
 706          $s_seq,     # subject sequence ref
 707        ) = @_;
 708
 709     my ( $i, $q_dist, $s_dist, $q_indels, $s_indels, $diff );
 710
 711     $q_indels = 0;
 712     $s_indels = 0;
 713
 714     if ( @{ $matches } > 0 )
 715     {
 716         @{ $matches } = sort { $a->{ 'Q_BEG' } <=> $b->{ 'Q_BEG' } } @{ $matches };  # FIXME is this necessary?
 717
 718         # >>>>>>>>>>>>>> FIRST MATCH <<<<<<<<<<<<<<
 719
 720         $diff = $matches->[ 0 ]->{ 'Q_BEG' } - $matches->[ 0 ]->{ 'S_BEG' };
 721
 722         if ( $diff > 0 )
 723         {
 724             substr ${ $s_seq }, 0, 0, '-' x $diff;
 725
 726             $s_indels += $diff;
 727         }
 728         elsif ( $diff < 0 )
 729         {
 730             substr ${ $q_seq }, 0, 0, '-' x abs $diff;
 731
 732             $q_indels += abs $diff;
 733         }
 734
 735         # >>>>>>>>>>>>>> MIDDLE MATCHES <<<<<<<<<<<<<<
 736
 737         for ( $i = 0; $i < scalar @{ $matches } - 1; $i++ )
 738         {
 739             $q_dist = $matches->[ $i + 1 ]->{ 'Q_BEG' } - $matches->[ $i ]->{ 'Q_END' };
 740             $s_dist = $matches->[ $i + 1 ]->{ 'S_BEG' } - $matches->[ $i ]->{ 'S_END' };
 741
 742             $diff   = $q_dist - $s_dist;
 743
 744             if ( $diff > 0 )
 745             {
 746                 substr ${ $s_seq }, $s_indels + $matches->[ $i ]->{ 'S_END' } + 1, 0, '-' x $diff;
 747
 748                 $s_indels += $diff;
 749             }
 750             elsif ( $diff < 0 )
 751             {
 752                 substr ${ $q_seq }, $q_indels + $matches->[ $i ]->{ 'Q_END' } + 1, 0, '-' x abs $diff;
 753
 754                 $q_indels += abs $diff;
 755             }
 756         }
 757     }
 758
 759     # >>>>>>>>>>>>>> LAST MATCH <<<<<<<<<<<<<<
 760
 761     $diff = length( ${ $q_seq } ) - length( ${ $s_seq } );
 762
 763     if ( $diff > 0 )
 764     {
 765         ${ $s_seq } .= '-' x $diff;
 766
 767         $s_indels += $diff;
 768     }
 769     else
 770     {
 771         ${ $q_seq } .= '-' x abs $diff;
 772
 773         $q_indels += abs $diff;
 774     }
 775 }
 776
 777 1;
 778
 779 __END__
 780
 781
 782 sub align_sim_local
 783 {
 784     # Martin A. Hansen, May 2007.
 785
 786     # Calculate the local similarity of an alignment based on
 787     # an alignment chain. The similarity is calculated as
 788     # the number of matching residues divided by the overall
 789     # length of the alignment chain. This means that a short
 790     # but "good" alignment will yield a high similarity, while
 791     # a long "poor" alignment will yeild a low similarity.
 792
 793     my ( $chain,   # list of matches in alignment
 794        ) = @_;
 795
 796     # returns a float
 797
 798     my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
 799
 800     return 0 if not @{ $chain };
 801
 802     $match_tot = 0;
 803     $q_end     = 0;
 804     $s_end     = 0;
 805     $q_beg     = 999999999;
 806     $s_beg     = 999999999;
 807
 808     foreach $match ( @{ $chain } )
 809     {
 810         $match_tot += $match->[ LEN ];
 811
 812         $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
 813         $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
 814
 815         $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
 816         $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
 817     }
 818
 819     $q_diff = $q_end - $q_beg + 1;
 820     $s_diff = $s_end - $s_beg + 1;
 821
 822     $max = Maasha::Calc::max( $q_diff, $s_diff );
 823
 824     $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
 825
 826     return $sim;
 827 }
 828
 829
 830 sub align_sim_global
 831 {
 832     # Martin A. Hansen, June 2007.
 833
 834     # Calculate the global similarity of an alignment based on
 835     # an alignment chain. The similarity is calculated as
 836     # the number of matching residues divided by the
 837     # length of the shortest sequence.
 838
 839     my ( $chain,   # list of matches in alignment
 840          $q_seq,   # ref to query sequence
 841          $s_seq,   # ref to subject sequence
 842        ) = @_;
 843
 844     # returns a float
 845
 846     my ( $match_tot, $min, $sim );
 847
 848     return 0 if not @{ $chain };
 849
 850     $match_tot = 0;
 851
 852     map { $match_tot += $_->[ LEN ] } @{ $chain };
 853
 854     $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
 855
 856     $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
 857
 858     return $sim;
 859 }
 860
 861
 862 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<