code_perl/Maasha/AlignTwoSeq.pm

   1 package Maasha::AlignTwoSeq;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Routines to create a pair-wise alignment.
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use warnings;
  32 use strict;
  33 use Data::Dumper;
  34 use Maasha::Calc;
  35 use Maasha::Seq;
  36 use vars qw ( @ISA @EXPORT );
  37
  38 @ISA = qw( Exporter );
  39
  40
  41 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  42
  43
  44 sub align_two_seq
  45 {
  46     # Martin A. Hansen, August 2006.
  47
  48     # Generates an alignment by chaining matches, which are subsequences
  49     # shared between two sequences. The routine functions by considering
  50     # only matches within a given search space. If no matches are given
  51     # these will be generated, if long matches are found these will be
  52     # included in the alignment, otherwise matches will be included depending
  53     # on a calculated score. New search spaces spanning the spaces between
  54     # matches and the search space boundaries will be cast and recursed into.
  55
  56     # Usage: align_two_seq( { Q_SEQ => \$q_seq, S_SEQ => \$s_seq }, [] );
  57
  58     my ( $space,     # search space
  59          $matches,   # list of matches
  60        ) = @_;
  61
  62     # Returns a list.
  63
  64     my ( @chain, $best_match, $left_space, $right_space );
  65
  66     new_space( $space );
  67
  68     matches_select( $matches, $space );
  69
  70     if ( scalar @{ $matches } == 0 )   # no matches - find some!
  71     {
  72         $matches = matches_find( $space );
  73
  74         if ( scalar @{ $matches } > 0 ) {
  75             push @chain, align_two_seq( $space, $matches );
  76         }
  77     }
  78     else   # matches are included according to score
  79     {
  80         $matches = matches_filter( $space, $matches );
  81
  82         $best_match = shift @{ $matches };
  83
  84         if ( $best_match )
  85         {
  86             push @chain, $best_match;
  87
  88             $left_space  = new_space_left( $best_match, $space );
  89             $right_space = new_space_right( $best_match, $space );
  90
  91             push @chain, align_two_seq( $left_space, $matches )  if defined $left_space;
  92             push @chain, align_two_seq( $right_space, $matches ) if defined $right_space;
  93         }
  94     }
  95
  96     return wantarray ? @chain : \@chain;
  97 }
  98
  99
 100 sub new_space
 101 {
 102     # Martin A. Hansen, August 2009.
 103
 104     # Define a new search space if not previously
 105     # defined. This occurs if align_two_seq() is
 106     # called initially with only the sequences.
 107
 108     my ( $space,   # search space
 109        ) = @_;
 110
 111     # Returns nothing.
 112
 113     if ( not defined $space->{ 'Q_MAX' } )
 114     {
 115         $space->{ 'Q_MIN' } = 0;
 116         $space->{ 'S_MIN' } = 0;
 117         $space->{ 'Q_MAX' } = length( ${ $space->{ 'Q_SEQ' } } ) - 1,
 118         $space->{ 'S_MAX' } = length( ${ $space->{ 'S_SEQ' } } ) - 1,
 119     }
 120 }
 121
 122
 123 sub new_space_left
 124 {
 125     # Martin A. Hansen, August 2009.
 126
 127     # Create a new search space between the beginning of
 128     # the current search space and the best match.
 129
 130     my ( $best_match,   # best match
 131          $space,        # search space
 132        ) = @_;
 133
 134     # Returns hashref.
 135
 136     my ( $new_space );
 137
 138     if ( $best_match->{ 'Q_BEG' } - $space->{ 'Q_MIN' } >= 2 and $best_match->{ 'S_BEG' } - $space->{ 'S_MIN' } >= 2 )
 139     {
 140         $new_space = {
 141             Q_SEQ => $space->{ 'Q_SEQ' },
 142             S_SEQ => $space->{ 'S_SEQ' },
 143             Q_MIN => $space->{ 'Q_MIN' },
 144             Q_MAX => $best_match->{ 'Q_BEG' } - 1,
 145             S_MIN => $space->{ 'S_MIN' },
 146             S_MAX => $best_match->{ 'S_BEG' } - 1,
 147         };
 148     }
 149
 150     return wantarray ? %{ $new_space } : $new_space;
 151 }
 152
 153
 154 sub new_space_right
 155 {
 156     # Martin A. Hansen, August 2009.
 157
 158     # Create a new search space between the best match
 159     # and the end of the current search space.
 160
 161     my ( $best_match,   # best match
 162          $space,        # search space
 163        ) = @_;
 164
 165     # Returns hashref.
 166
 167     my ( $new_space );
 168
 169     if ( $space->{ 'Q_MAX' } - $best_match->{ 'Q_END' } >= 2 and $space->{ 'S_MAX' } - $best_match->{ 'S_END' } >= 2 )
 170     {
 171         $new_space = {
 172             Q_SEQ => $space->{ 'Q_SEQ' },
 173             S_SEQ => $space->{ 'S_SEQ' },
 174             Q_MIN => $best_match->{ 'Q_END' } + 1,
 175             Q_MAX => $space->{ 'Q_MAX' },
 176             S_MIN => $best_match->{ 'S_END' } + 1,
 177             S_MAX => $space->{ 'S_MAX' },
 178         };
 179     }
 180
 181     return wantarray ? %{ $new_space } : $new_space;
 182 }
 183
 184
 185 sub matches_select
 186 {
 187     # Martin A. Hansen, August 2006.
 188
 189     # Given a list of matches and a search space,
 190     # include only those matches contained within
 191     # this search space.
 192
 193     my ( $matches,   # list of matches
 194          $space,     # search space
 195        ) = @_;
 196
 197     # Returns nothing.
 198
 199     @{ $matches } = grep { $_->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
 200                            $_->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
 201                            $_->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
 202                            $_->{ 'S_END' } <= $space->{ 'S_MAX' } } @{ $matches };
 203 }
 204
 205
 206 sub word_size_calc
 207 {
 208     # Martin A. Hansen, August 2008.
 209
 210     # Given a search space calculates the wordsize for a word so a match
 211     # occurs only a few times. More matches may be needed at low similarity in
 212     # order to avoid starting with a wrong match.
 213
 214     my ( $space,   # search space
 215        ) = @_;
 216
 217     # Returns integer.
 218
 219     my ( $factor, $word_size );
 220
 221     $factor = 0.05;
 222
 223     if ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } < $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) {
 224         $word_size = int( $factor * ( $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } ) ) + 1;
 225     } else {
 226         $word_size = int( $factor * ( $space->{ 'S_MAX' } - $space->{ 'S_MIN' } ) ) + 1;
 227     }
 228
 229     return $word_size;
 230 }
 231
 232
 233 sub seq_index
 234 {
 235     # Martin A. Hansen, August 2009.
 236
 237     # Indexes a query sequence in a specified search space
 238     # Overlapping words are saved as keys in an index hash,
 239     # and a list of word postions are saved as value.
 240
 241     my ( $space,       # search space
 242          $word_size,   # word size
 243        ) = @_;
 244
 245     # Returns a hashref.
 246
 247     my ( $i, $word, %index );
 248
 249     for ( $i = $space->{ 'Q_MIN' }; $i <= $space->{ 'Q_MAX' } - $word_size + 1; $i++ )
 250     {
 251         $word = uc substr ${ $space->{ 'Q_SEQ' } }, $i, $word_size;
 252
 253         next if $word =~ tr/N//;   # Skip sequences with Ns.
 254
 255         push @{ $index{ $word } }, $i;
 256     }
 257
 258     return wantarray ? %index : \%index;
 259 }
 260
 261
 262 sub seq_scan
 263 {
 264     # Martin A. Hansen, August 2009.
 265
 266     my ( $index,       # sequence index
 267          $space,       # search space
 268          $word_size,   # word size
 269        ) = @_;
 270
 271     # Returns a list.
 272
 273     my ( $i, $word, $pos, $match, @matches, $redundant );
 274
 275     $redundant = {};
 276
 277     for ( $i = $space->{ 'S_MIN' }; $i <= $space->{ 'S_MAX' } - $word_size + 1; $i++ )
 278     {
 279         $word = uc substr ${ $space->{ 'S_SEQ' } }, $i, $word_size;
 280
 281         if ( exists $index->{ $word } )
 282         {
 283             foreach $pos ( @{ $index->{ $word } } )
 284             {
 285                 $match = {
 286                     'Q_BEG' => $pos,
 287                     'Q_END' => $pos + $word_size - 1,
 288                     'S_BEG' => $i,
 289                     'S_END' => $i + $word_size - 1,
 290                     'LEN'   => $word_size,
 291                     'SCORE' => 0,
 292                 };
 293
 294                 if ( not match_redundant( $match, $redundant ) )
 295                 {
 296                     match_expand( $match, $space );
 297
 298                     push @matches, $match;
 299
 300                     match_redundant_add( $match, $redundant );
 301                 }
 302             }
 303         }
 304     }
 305
 306     return wantarray ? @matches : \@matches;
 307 }
 308
 309
 310 sub matches_find
 311 {
 312     # Martin A. Hansen, November 2006
 313
 314     # Given two sequences, find all maximum expanded matches between these.
 315
 316     my ( $space,   # search space
 317        ) = @_;
 318
 319     # returns list of matches
 320
 321     my ( $word_size, $index, $matches );
 322
 323     $word_size = word_size_calc( $space );
 324
 325     $matches = [];
 326
 327     while ( scalar @{ $matches } == 0 and $word_size > 0 )
 328     {
 329         $index   = seq_index( $space, $word_size );
 330         $matches = seq_scan( $index, $space, $word_size );
 331
 332         $word_size--;
 333     }
 334
 335     return wantarray ? @{ $matches } : $matches;
 336 }
 337
 338
 339 sub match_expand
 340 {
 341     # Martin A. Hansen, August 2009.
 342
 343     # Given a match and a search space hash,
 344     # expand the match maximally both forward and
 345     # backward direction.
 346
 347     my ( $match,   # match to expand
 348          $space,   # search space
 349        ) = @_;
 350
 351     # Returns nothing.
 352
 353     match_expand_forward( $match, $space );
 354     match_expand_backward( $match, $space );
 355 }
 356
 357
 358 sub match_expand_forward
 359 {
 360     # Martin A. Hansen, August 2009.
 361
 362     # Given a match and a search space hash,
 363     # expand the match maximally in the forward
 364     # direction.
 365
 366     my ( $match,   # match to expand
 367          $space,   # search space
 368        ) = @_;
 369
 370     # Returns nothing.
 371
 372     while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
 373             $match->{ 'S_END' } <= $space->{ 'S_MAX' } and
 374             uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
 375     {
 376         $match->{ 'Q_END' }++;
 377         $match->{ 'S_END' }++;
 378         $match->{ 'LEN' }++;
 379     }
 380
 381     $match->{ 'Q_END' }--;
 382     $match->{ 'S_END' }--;
 383     $match->{ 'LEN' }--;
 384 }
 385
 386
 387 sub match_expand_backward
 388 {
 389     # Martin A. Hansen, August 2009.
 390
 391     # Given a match and a search space hash,
 392     # expand the match maximally in the backward
 393     # direction.
 394
 395     my ( $match,   # match to expand
 396          $space,   # search space
 397        ) = @_;
 398
 399     # Returns nothing.
 400
 401     while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
 402             $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
 403             uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
 404     {
 405         $match->{ 'Q_BEG'}--;
 406         $match->{ 'S_BEG'}--;
 407         $match->{ 'LEN' }++;
 408     }
 409
 410     $match->{ 'Q_BEG'}++;
 411     $match->{ 'S_BEG'}++;
 412     $match->{ 'LEN' }--;
 413 }
 414
 415
 416 sub match_redundant
 417 {
 418     # Martin A. Hansen, August 2009
 419
 420     my ( $new_match,       # match
 421          $redundant,   # hashref
 422        ) = @_;
 423
 424     # Returns boolean.
 425
 426     my ( $match );
 427
 428     foreach $match ( @{ $redundant->{ $new_match->{ 'Q_BEG' } } } )
 429     {
 430         if ( $new_match->{ 'S_BEG' } >= $match->{ 'S_BEG' } and $new_match->{ 'S_END' } <= $match->{ 'S_END' } ) {
 431             return 1;
 432         }
 433     }
 434
 435     return 0;
 436 }
 437
 438
 439 sub match_redundant_add
 440 {
 441     # Martin A. Hansen, August 2009
 442
 443     my ( $match,       # match
 444          $redundant,   # hash ref
 445        ) = @_;
 446
 447     # Returns nothing.
 448
 449     map { push @{ $redundant->{ $_ } }, $match } ( $match->{ 'Q_BEG' } .. $match->{ 'Q_END' } );
 450 }
 451
 452
 453 sub matches_filter
 454 {
 455     # Martin A. Hansen, August 2009.
 456
 457     my ( $space,     # search space
 458          $matches,   # list of matches
 459        ) = @_;
 460
 461     # Returns a list.
 462
 463     my ( $best_match, $match, @new_matches );
 464
 465     $best_match = shift @{ $matches };
 466
 467     match_score( $best_match, $space );
 468
 469     foreach $match ( @{ $matches } )
 470     {
 471         match_score( $match, $space );
 472
 473         if ( $match->{ 'SCORE' } > 0 )
 474         {
 475             if ( $match->{ 'SCORE' } > $best_match->{ 'SCORE' } ) {
 476                 $best_match = $match;
 477             } else {
 478                 push @new_matches, $match;
 479             }
 480         }
 481     }
 482
 483     unshift @new_matches, $best_match if $best_match->{ 'SCORE' } > 0;
 484
 485     return wantarray ? @new_matches : \@new_matches;
 486 }
 487
 488
 489 sub match_score
 490 {
 491     # Martin A. Hansen, August 2009
 492
 493     # Given a match and a search space scores the match according to three criteria:
 494
 495     # 1) length of match - the longer the better.
 496     # 2) distance to closest corner - the shorter the better.
 497     # 3) distance to closest narrow end of the search space - the shorter the better.
 498
 499     # Each of these scores are divided by search space dimensions, and the total score
 500     # is calculated: total = score_len - score_corner - score_narrow.
 501
 502     # The higher the score, the better the match.
 503
 504     my ( $match,   # match
 505          $space,   # search space
 506        ) = @_;
 507
 508     # Returns nothing.
 509
 510     my ( $score_len, $score_corner, $score_diag, $score_narrow, $score_total );
 511
 512     $score_len    = match_score_len( $match );
 513     $score_corner = match_score_corner( $match, $space );
 514     $score_diag   = match_score_diag( $match, $space );
 515     $score_narrow = 0; #match_score_narrow( $match, $space );
 516
 517     $score_total  = $score_len - $score_corner - $score_diag - $score_narrow;
 518
 519     # print STDERR "LEN: $score_len   CORNER: $score_corner   DIAG: $score_diag   NARROW: $score_narrow: TOTAL: $score_total\n" if $score_total > 0;
 520     print STDERR "LEN: $score_len   CORNER: $score_corner   DIAG: $score_diag   NARROW: $score_narrow: TOTAL: $score_total\n";
 521
 522     $match->{ 'SCORE' } = $score_total;
 523 }
 524
 525
 526 sub match_score_len
 527 {
 528     # Martin A. Hansen, August 2009
 529
 530     # Score according to match length.
 531
 532     my ( $match,   # match
 533        ) = @_;
 534
 535     # Returns integer.
 536
 537     return $match->{ 'LEN' } * 2;
 538 }
 539
 540
 541 sub match_score_corner
 542 {
 543     # Martin A. Hansen, August 2009
 544
 545     # Score according to distance from corner.
 546
 547     my ( $match,   # match
 548          $space,   # search space
 549        ) = @_;
 550
 551     # Returns integer.
 552
 553     my ( $left_dist, $right_dist, $score_corner );
 554
 555     $left_dist  = Maasha::Calc::dist_point2point( $space->{ 'Q_MIN' }, $space->{ 'S_MIN' }, $match->{ 'Q_BEG' } , $match->{ 'S_BEG' } );
 556     $right_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MAX' }, $space->{ 'S_MAX' }, $match->{ 'Q_END' } , $match->{ 'S_END' } );
 557
 558     $score_corner = Maasha::Calc::min( $left_dist, $right_dist );
 559
 560     return int $score_corner * 1.5;
 561 }
 562
 563
 564 sub match_score_diag
 565 {
 566     # Martin A. Hansen, August 2009
 567
 568     # Score according to distance from diagonal.
 569
 570     my ( $match,   # match
 571          $space,   # search space
 572        ) = @_;
 573
 574     # Returns integer.
 575
 576     my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag );
 577
 578     $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
 579     $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
 580
 581     if ( $q_dim >= $s_dim ) # s_dim is the narrow end
 582     {
 583         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
 584                                                         $match->{ 'S_BEG' },
 585                                                         $space->{ 'Q_MIN' },
 586                                                         $space->{ 'S_MIN' },
 587                                                         $space->{ 'Q_MIN' } + $s_dim,
 588                                                         $space->{ 'S_MIN' } + $s_dim );
 589
 590         $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
 591                                                         $match->{ 'S_BEG' },
 592                                                         $space->{ 'Q_MAX' } - $s_dim,
 593                                                         $space->{ 'S_MAX' } - $s_dim,
 594                                                         $space->{ 'Q_MAX' },
 595                                                         $space->{ 'S_MAX' } );
 596     }
 597     else
 598     {
 599         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
 600                                                         $match->{ 'S_BEG' },
 601                                                         $space->{ 'Q_MIN' },
 602                                                         $space->{ 'S_MIN' },
 603                                                         $space->{ 'Q_MIN' } + $q_dim,
 604                                                         $space->{ 'S_MIN' } + $q_dim );
 605
 606         $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
 607                                                         $match->{ 'S_BEG' },
 608                                                         $space->{ 'Q_MAX' } - $q_dim,
 609                                                         $space->{ 'S_MAX' } - $q_dim,
 610                                                         $space->{ 'Q_MAX' },
 611                                                         $space->{ 'S_MAX' } );
 612     }
 613
 614     $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
 615
 616     $score_diag = $min_diag_dist;
 617
 618     return int $score_diag;
 619 }
 620
 621
 622 sub match_score_narrow
 623 {
 624     # Martin A. Hansen, August 2009
 625
 626     # Score according to distance to the narrow end of the search space.
 627
 628     my ( $match,   # match
 629          $space,   # search space
 630        ) = @_;
 631
 632     # Returns integer.
 633
 634     my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow );
 635
 636     $max_narrow_dist = 0;
 637
 638     $q_dim = $space->{ 'Q_MAX' } - $space->{ 'Q_MIN' } + 1;
 639     $s_dim = $space->{ 'S_MAX' } - $space->{ 'S_MIN' } + 1;
 640
 641     if ( $q_dim > $s_dim ) # s_dim is the narrow end
 642     {
 643         $beg_narrow_dist = $match->{ 'Q_BEG' } - $space->{ 'Q_MIN' };
 644         $end_narrow_dist = $space->{ 'Q_MAX' } - $match->{ 'Q_BEG' };
 645
 646         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 647     }
 648     elsif ( $q_dim < $s_dim ) # q_dim is the narrow end
 649     {
 650         $beg_narrow_dist = $match->{ 'S_BEG' } - $space->{ 'S_MIN' };
 651         $end_narrow_dist = $space->{ 'S_MAX' } - $match->{ 'S_BEG' };
 652
 653         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 654     }
 655
 656     $score_narrow = $max_narrow_dist;
 657
 658     return abs( $beg_narrow_dist - $end_narrow_dist );
 659
 660     return $score_narrow;
 661 }
 662
 663
 664 sub shift_case
 665 {
 666     # Martin A. Hansen, August 2009.
 667
 668     # Given a list of matches and sequence references
 669     # uppercase only matching sequnce while lowercasing the rest.
 670
 671     my ( $matches,   # list of matches
 672          $q_seq,     # query sequence ref
 673          $s_seq,     # subject sequence ref
 674        ) = @_;
 675
 676     # Returns nothing.
 677
 678     my ( $match );
 679
 680     ${ $q_seq } = lc ${ $q_seq };
 681     ${ $s_seq } = lc ${ $s_seq };
 682
 683     foreach $match ( @{ $matches } )
 684     {
 685         substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }, uc substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' };
 686         substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }, uc substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' };
 687     }
 688 }
 689
 690
 691
 692 sub insert_indels
 693 {
 694     # Martin A. Hansen, August 2009.
 695
 696     # Given a list of matches and sequence references
 697     # insert indels to form aligned sequences.
 698
 699     my ( $matches,   # list of matches
 700          $q_seq,     # query sequence ref
 701          $s_seq,     # subject sequence ref
 702        ) = @_;
 703
 704     my ( $i, $q_dist, $s_dist, $q_indels, $s_indels, $diff );
 705
 706     $q_indels = 0;
 707     $s_indels = 0;
 708
 709     if ( @{ $matches } > 0 )
 710     {
 711         @{ $matches } = sort { $a->{ 'Q_BEG' } <=> $b->{ 'Q_BEG' } } @{ $matches };  # FIXME is this necessary?
 712
 713         # >>>>>>>>>>>>>> FIRST MATCH <<<<<<<<<<<<<<
 714
 715         $diff = $matches->[ 0 ]->{ 'Q_BEG' } - $matches->[ 0 ]->{ 'S_BEG' };
 716
 717         if ( $diff > 0 )
 718         {
 719             substr ${ $s_seq }, 0, 0, '-' x $diff;
 720
 721             $s_indels += $diff;
 722         }
 723         elsif ( $diff < 0 )
 724         {
 725             substr ${ $q_seq }, 0, 0, '-' x abs $diff;
 726
 727             $q_indels += abs $diff;
 728         }
 729
 730         # >>>>>>>>>>>>>> MIDDLE MATCHES <<<<<<<<<<<<<<
 731
 732         for ( $i = 0; $i < scalar @{ $matches } - 1; $i++ )
 733         {
 734             $q_dist = $matches->[ $i + 1 ]->{ 'Q_BEG' } - $matches->[ $i ]->{ 'Q_END' };
 735             $s_dist = $matches->[ $i + 1 ]->{ 'S_BEG' } - $matches->[ $i ]->{ 'S_END' };
 736
 737             $diff   = $q_dist - $s_dist;
 738
 739             if ( $diff > 0 )
 740             {
 741                 substr ${ $s_seq }, $s_indels + $matches->[ $i ]->{ 'S_END' } + 1, 0, '-' x $diff;
 742
 743                 $s_indels += $diff;
 744             }
 745             elsif ( $diff < 0 )
 746             {
 747                 substr ${ $q_seq }, $q_indels + $matches->[ $i ]->{ 'Q_END' } + 1, 0, '-' x abs $diff;
 748
 749                 $q_indels += abs $diff;
 750             }
 751         }
 752     }
 753
 754     # >>>>>>>>>>>>>> LAST MATCH <<<<<<<<<<<<<<
 755
 756     $diff = length( ${ $q_seq } ) - length( ${ $s_seq } );
 757
 758     if ( $diff > 0 )
 759     {
 760         ${ $s_seq } .= '-' x $diff;
 761
 762         $s_indels += $diff;
 763     }
 764     else
 765     {
 766         ${ $q_seq } .= '-' x abs $diff;
 767
 768         $q_indels += abs $diff;
 769     }
 770 }
 771
 772
 773 __END__
 774
 775
 776 sub align_sim_local
 777 {
 778     # Martin A. Hansen, May 2007.
 779
 780     # Calculate the local similarity of an alignment based on
 781     # an alignment chain. The similarity is calculated as
 782     # the number of matching residues divided by the overall
 783     # length of the alignment chain. This means that a short
 784     # but "good" alignment will yield a high similarity, while
 785     # a long "poor" alignment will yeild a low similarity.
 786
 787     my ( $chain,   # list of matches in alignment
 788        ) = @_;
 789
 790     # returns a float
 791
 792     my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
 793
 794     return 0 if not @{ $chain };
 795
 796     $match_tot = 0;
 797     $q_end     = 0;
 798     $s_end     = 0;
 799     $q_beg     = 999999999;
 800     $s_beg     = 999999999;
 801
 802     foreach $match ( @{ $chain } )
 803     {
 804         $match_tot += $match->[ LEN ];
 805
 806         $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
 807         $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
 808
 809         $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
 810         $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
 811     }
 812
 813     $q_diff = $q_end - $q_beg + 1;
 814     $s_diff = $s_end - $s_beg + 1;
 815
 816     $max = Maasha::Calc::max( $q_diff, $s_diff );
 817
 818     $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
 819
 820     return $sim;
 821 }
 822
 823
 824 sub align_sim_global
 825 {
 826     # Martin A. Hansen, June 2007.
 827
 828     # Calculate the global similarity of an alignment based on
 829     # an alignment chain. The similarity is calculated as
 830     # the number of matching residues divided by the
 831     # length of the shortest sequence.
 832
 833     my ( $chain,   # list of matches in alignment
 834          $q_seq,   # ref to query sequence
 835          $s_seq,   # ref to subject sequence
 836        ) = @_;
 837
 838     # returns a float
 839
 840     my ( $match_tot, $min, $sim );
 841
 842     return 0 if not @{ $chain };
 843
 844     $match_tot = 0;
 845
 846     map { $match_tot += $_->[ LEN ] } @{ $chain };
 847
 848     $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
 849
 850     $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
 851
 852     return $sim;
 853 }
 854
 855
 856 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<