code_perl/Maasha/AlignTwoSeq.pm

   1 package Align;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # yak yak yak
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use Data::Dumper;
  33 use Storable qw( dclone );
  34 use IPC::Open2;
  35 use Maasha::Calc;
  36 use Maasha::Seq;
  37 use vars qw ( @ISA @EXPORT );
  38
  39 use constant {
  40     Q_BEG => 0,
  41     Q_END => 1,
  42     S_BEG => 2,
  43     S_END => 3,
  44     LEN   => 4,
  45     SCORE => 5,
  46     HEAD  => 0,
  47     SEQ   => 1,
  48 };
  49
  50 @ISA = qw( Exporter );
  51
  52
  53 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  54
  55
  56 sub align_two_seq
  57 {
  58     # Martin A. Hansen, August 2006.
  59
  60     # Generates an alignment by chaining matches, which are subsequences
  61     # shared between two sequences. The routine functions by considering
  62     # only matches within a given search space. If no matches are given
  63     # these will be generated, if long matches are found these will be
  64     # included in the alignment, otherwise matches will be included depending
  65     # on a calculated score. New search spaces spanning the spaces between
  66     # matches and the search space boundaries will be cast and recursed into.
  67
  68     my ( $q_seq,     # sequence 1 ref
  69          $s_seq,     # sequence 2 ref
  70          $matches,   # list of matches
  71          $q_min,     # q sequence start position
  72          $q_max,     # q sequecne stop  position
  73          $s_min,     # s sequence start position
  74          $s_max,     # s sequecne stop  position
  75          $args,      # argument hash
  76        ) = @_;
  77
  78     # returns a chain of matches that can be chained into an alignment
  79
  80     $matches ||= [];
  81     $q_min   ||= 0;
  82     $s_min   ||= 0;
  83     $q_max   ||= length( ${ $q_seq } ) - 1;
  84     $s_max   ||= length( ${ $s_seq } ) - 1;
  85
  86     $args->{ "long_matches" }  ||= 50;
  87     $args->{ "alph_len" }      ||= 4;
  88
  89     my ( $wordsize, @chain, $match, $best_match, @long_matches );
  90
  91     $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
  92
  93     if ( scalar @{ $matches } == 0 )   # no matches - find some!
  94     {
  95         # $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max, $args );
  96         $wordsize = 4;
  97         $matches  = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
  98
  99         while ( scalar @{ $matches } == 0 and $wordsize > 1 )
 100         {
 101             $wordsize--;
 102             $matches = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
 103         }
 104
 105         if ( scalar @{ $matches } > 0 ) {
 106             push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args );
 107         }
 108     }
 109     elsif ( @long_matches = grep { $_->[ LEN ] >= $args->{ "long_matches" } } @{ $matches } )   # long matches found - include all that don't overlap!
 110     {
 111         @long_matches = order_matches( \@long_matches );
 112
 113         foreach $match ( @long_matches )
 114         {
 115             push @chain, $match;
 116
 117             if ( $match->[ Q_BEG ] - $q_min >= 2 and $match->[ S_BEG ] - $s_min >= 2 ) {
 118                 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $match->[ Q_BEG ] - 1, $s_min, $match->[ S_BEG ] - 1, $args );   # intermediate search space
 119             }
 120
 121             $q_min = $match->[ Q_END ] + 1;
 122             $s_min = $match->[ S_END ] + 1;
 123         }
 124
 125         if ( $q_min + 1 < $q_max and $s_min + 1 < $s_max ) {
 126             push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args );   # remaining search space
 127         }
 128     }
 129     else   # shorter matches are included according to score
 130     {
 131         foreach $match ( @{ $matches } ) {
 132             # $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
 133             $match->[ SCORE ] = score_match_niels( $match, $q_seq, $q_min, $q_max, $s_min, $s_max );
 134         }
 135
 136         # @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
 137         @{ $matches } = grep { $_->[ SCORE ] <= 0.25 } @{ $matches };
 138         # @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
 139         @{ $matches } = sort { $a->[ SCORE ] <=> $b->[ SCORE ] } @{ $matches };
 140
 141         $best_match   = shift @{ $matches };
 142
 143         if ( $best_match )
 144         {
 145             push @chain, $best_match;
 146
 147             if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
 148                 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1, $args );   # left search space
 149             }
 150
 151             if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
 152                 push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max, $args );   # right search space
 153             }
 154         }
 155     }
 156
 157     return wantarray ? @chain : \@chain;
 158 }
 159
 160
 161 sub select_matches
 162 {
 163     # Martin A. Hansen, August 2006.
 164
 165     # Given a list of matches and a search space,
 166     # include only those matches contained within
 167     # this search space.
 168
 169     my ( $matches,   # list of matches
 170          $q_min,     # q sequence start position
 171          $q_max,     # q sequecne stop  position
 172          $s_min,     # s sequence start position
 173          $s_max,     # s sequecne stop  position
 174        ) = @_;
 175
 176     # returns list of matches
 177
 178     my ( @matches );
 179
 180     @matches = grep { $_->[ Q_BEG ] >= $q_min and
 181                       $_->[ S_BEG ] >= $s_min and
 182                       $_->[ Q_END ] <= $q_max and
 183                       $_->[ S_END ] <= $s_max } @{ $matches };
 184
 185     return wantarray ? @matches : \@matches;
 186 }
 187
 188
 189 sub order_matches
 190 {
 191     # Martin A. Hansen, October 2006
 192
 193     # given a list of long matches, order these by length and position
 194     # and include only those long matches that does not cross.
 195
 196     my ( $long_matches,   # list of matches
 197        ) = @_;
 198
 199     # returns a list of matches
 200
 201     my ( @matches, $match, $i );
 202
 203     @{ $long_matches } = sort { $b->[ LEN ] <=> $a->[ LEN ] } @{ $long_matches };
 204
 205     @matches = shift @{ $long_matches };
 206
 207     foreach $match ( @{ $long_matches } )
 208     {
 209         if ( $match->[ Q_END ] < $matches[ 0 ]->[ Q_BEG ] and $match->[ S_END ] < $matches[ 0 ]->[ S_BEG ] )
 210         {
 211             unshift @matches, $match;
 212         }
 213         elsif ( $match->[ Q_BEG ] > $matches[ -1 ]->[ Q_END ] and $match->[ S_BEG ] > $matches[ -1 ]->[ S_END ] )
 214         {
 215             push @matches, $match;
 216         }
 217         else
 218         {
 219             for ( $i = 1; $i < @matches; $i++ )
 220             {
 221                 if ( $matches[ $i - 1 ]->[ Q_END ] < $match->[ Q_BEG ] and $match->[ Q_END ] < $matches[ $i ]->[ Q_BEG ] and
 222                      $matches[ $i - 1 ]->[ S_END ] < $match->[ S_BEG ] and $match->[ S_END ] < $matches[ $i ]->[ S_BEG ]
 223                 )
 224                 {
 225                     splice @matches, $i, 0, dclone $match;
 226                     last;
 227                 }
 228             }
 229         }
 230     }
 231
 232     return wantarray ? @matches : \@matches;
 233 }
 234
 235
 236 sub find_wordsize
 237 {
 238     # Martin A. Hansen, August 2006.
 239
 240     # Given a search space calculates the wordsize for a word so a match
 241     # occurs only a few times. More matches may be needed at low similarity in
 242     # order to avoid starting with a wrong match.
 243
 244     my ( $q_min,   # q sequence start position
 245          $q_max,   # q sequecne stop position
 246          $s_min,   # s sequence start position
 247          $s_max,   # s sequecne stop position
 248          $args,    # argument hash
 249        ) = @_;
 250
 251     # returns integer
 252
 253     my ( $q_dim, $s_dim, $dim_min, $wordsize );
 254
 255     $q_dim = $q_max - $q_min + 1;
 256     $s_dim = $s_max - $s_min + 1;
 257
 258     $dim_min = Maasha::Calc::min( $q_dim, $s_dim );
 259
 260     $wordsize = 1;
 261
 262     if ( $dim_min > 2000000 )                # optimized for DNA
 263     {
 264         $wordsize = $args->{ "long_matches" };
 265     }
 266     elsif ( $dim_min > 100000 )                # optimized for DNA
 267     {
 268         $wordsize = int( $args->{ "long_matches" } / 2 );
 269     }
 270     elsif ( $q_dim > 100 or $s_dim > 100 )  # optimized for DNA
 271     {
 272         while ( $args->{ "alph_len" } ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
 273             $wordsize++;
 274         }
 275     }
 276     else
 277     {
 278         while ( $args->{ "alph_len" } ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
 279             $wordsize++;
 280         }
 281     }
 282
 283     return $wordsize;
 284 }
 285
 286
 287 sub find_matches
 288 {
 289     # Martin A. Hansen, November 2006
 290
 291     # given two sequences, find all maximum expanded matches between these
 292
 293     my ( $q_seq,      # sequence 1
 294          $s_seq,      # sequence 2
 295          $wordsize,   # word size
 296          $q_min,      # q sequence start position
 297          $q_max,      # q sequecne stop  position
 298          $s_min,      # s sequence start position
 299          $s_max,      # s sequecne stop  position
 300        ) = @_;
 301
 302     # returns list of matches
 303
 304     my ( $q_beg, $q_word, %word_hash, $s_beg, $s_word, $match, @matches );
 305
 306     if ( length ${ $s_seq } > length ${ $q_seq } )
 307     {
 308         for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
 309         {
 310             $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
 311
 312             next if $q_word =~ /n/i; #   DNA/genome optimization
 313
 314             push @{ $word_hash{ $q_word } }, $q_beg;
 315         }
 316
 317         for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
 318         {
 319             $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
 320
 321             if ( exists $word_hash{ $s_word } )
 322             {
 323                 foreach $q_beg ( @{ $word_hash{ $s_word } } )
 324                 {
 325                     $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
 326
 327                     if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
 328                                 $match->[ Q_END ] <= $_->[ Q_END ] and
 329                                 $match->[ S_BEG ] >= $_->[ S_BEG ] and
 330                                 $match->[ S_END ] <= $_->[ S_END ] } @matches )
 331                     {
 332                         next;   # match is redundant
 333                     }
 334                     else
 335                     {
 336                         $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
 337                         $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
 338
 339                         push @matches, $match;
 340                     }
 341                 }
 342             }
 343         }
 344     }
 345     else
 346     {
 347         for ( $s_beg = $s_min; $s_beg <= $s_max - $wordsize + 1; $s_beg++ )
 348         {
 349             $s_word = lc substr ${ $s_seq }, $s_beg, $wordsize;
 350
 351             next if $s_word =~ /n/i; #   DNA/genome optimization
 352
 353             push @{ $word_hash{ $s_word } }, $s_beg;
 354         }
 355
 356         for ( $q_beg = $q_min; $q_beg <= $q_max - $wordsize + 1; $q_beg++ )
 357         {
 358             $q_word = lc substr ${ $q_seq }, $q_beg, $wordsize;
 359
 360             if ( exists $word_hash{ $q_word } )
 361             {
 362                 foreach $s_beg ( @{ $word_hash{ $q_word } } )
 363                 {
 364                     $match = [ $q_beg, $q_beg + $wordsize - 1, $s_beg, $s_beg + $wordsize - 1 ];
 365
 366                     if ( grep { $match->[ Q_BEG ] >= $_->[ Q_BEG ] and
 367                                 $match->[ Q_END ] <= $_->[ Q_END ] and
 368                                 $match->[ S_BEG ] >= $_->[ S_BEG ] and
 369                                 $match->[ S_END ] <= $_->[ S_END ] } @matches )
 370                     {
 371                         next;   # match is redundant
 372                     }
 373                     else
 374                     {
 375                         $match = expand_match( $q_seq, $s_seq, $match, $q_max, $q_min, $s_max, $s_min );
 376                         $match->[ LEN ] = $match->[ Q_END ] - $match->[ Q_BEG ] + 1;
 377
 378                         push @matches, $match;
 379                     }
 380                 }
 381             }
 382         }
 383     }
 384
 385     return wantarray ? @matches : \@matches;
 386 }
 387
 388
 389 sub expand_match
 390 {
 391     # Martin A. Hansen, August 2006.
 392
 393     # Given two sequences and a match, expand the match maximally.
 394     # A match is defined like this: [ Q_BEG, Q_END, S_BEG, S_END ]
 395
 396     my ( $q_seq,   # sequence 1 ref
 397          $s_seq,   # sequence 2 ref
 398          $match,   # sequence match
 399          $q_max,   # q sequecne stop position
 400          $q_min,   # q sequence start position
 401          $s_max,   # s sequecne stop position
 402          $s_min,   # s sequence start position
 403        ) = @_;
 404
 405     # returns match
 406
 407     my ( $q_pos, $s_pos );
 408
 409     # expanding forward
 410
 411     $q_pos = $match->[ Q_END ] + 1;
 412     $s_pos = $match->[ S_END ] + 1;
 413
 414     while ( $q_pos <= $q_max and $s_pos <= $s_max and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
 415     {
 416         $q_pos++;
 417         $s_pos++;
 418     }
 419
 420     $match->[ Q_END ] = $q_pos - 1;
 421     $match->[ S_END ] = $s_pos - 1;
 422
 423     # expanding backwards
 424
 425     $q_pos = $match->[ Q_BEG ] - 1;
 426     $s_pos = $match->[ S_BEG ] - 1;
 427
 428     while ( $q_pos >= $q_min and $s_pos >= $s_min and lc substr( ${ $q_seq }, $q_pos, 1 ) eq lc substr( ${ $s_seq }, $s_pos, 1 ) )
 429     {
 430         $q_pos--;
 431         $s_pos--;
 432     }
 433
 434     $match->[ Q_BEG ] = $q_pos + 1;
 435     $match->[ S_BEG ] = $s_pos + 1;
 436
 437     return $match;
 438 }
 439
 440
 441 sub score_match
 442 {
 443     # Martin A. Hansen, August 2006
 444
 445     # given a match and a search space scores the match according to three criteria:
 446
 447     # 1) length of match - the longer the better.
 448     # 2) distance to closest corner - the shorter the better.
 449     # 3) distance to closest narrow end of the search space - the shorter the better.
 450
 451     # each of these scores are divided by search space dimentions, and the total score
 452     # is calculated: total = score_len - score_corner - score_narrow
 453
 454     # the higher the score, the better the match.
 455
 456     my ( $match,   # match
 457          $q_min,   # q sequence start position
 458          $q_max,   # q sequecne stop  position
 459          $s_min,   # s sequence start position
 460          $s_max,   # s sequecne stop  position
 461        ) = @_;
 462
 463     # returns a positive number
 464
 465     my ( $q_dim, $s_dim, $score_len, $score_corner, $score_narrow, $score_total, $beg_diag_dist, $end_diag_dist,
 466          $min_diag_dist, $score_diag, $beg_narrow_dist, $end_narrow_dist, $max_narrow_dist );
 467
 468     # ----- 1) scoring according to match length
 469
 470     $score_len = $match->[ LEN ] ** 3;
 471
 472     # ---- 2) score according to distance away from diagonal
 473
 474     $q_dim = $q_max - $q_min + 1;
 475     $s_dim = $s_max - $s_min + 1;
 476
 477     if ( $q_dim >= $s_dim ) # s_dim is the narrow end
 478     {
 479         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $s_dim, $s_min + $s_dim );
 480         $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $s_dim, $s_max - $s_dim, $q_max, $s_max );
 481     }
 482     else
 483     {
 484         $beg_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_min, $s_min, $q_min + $q_dim, $s_min + $q_dim );
 485         $end_diag_dist = Maasha::Calc::dist_point2line( $match->[ Q_BEG ], $match->[ S_BEG ], $q_max - $q_dim, $s_max - $q_dim, $q_max, $s_max );
 486     }
 487
 488     $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
 489
 490     $score_diag = 2 * $min_diag_dist ** 2;
 491
 492     # ----- 3) scoring according to distance to the narrow end of the search space
 493
 494     if ( $q_dim > $s_dim ) # s_dim is the narrow end
 495     {
 496         $beg_narrow_dist = $match->[ Q_BEG ] - $q_min;
 497         $end_narrow_dist = $q_max - $match->[ Q_BEG ];
 498
 499         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 500     }
 501     elsif ( $q_dim < $s_dim )
 502     {
 503         $beg_narrow_dist = $match->[ S_BEG ] - $s_min;
 504         $end_narrow_dist = $s_max - $match->[ S_BEG ];
 505
 506         $max_narrow_dist = Maasha::Calc::max( $beg_narrow_dist, $end_narrow_dist );
 507     }
 508     else
 509     {
 510         $max_narrow_dist = 0;
 511     }
 512
 513     $score_narrow = $max_narrow_dist;
 514
 515     $score_total = $score_len - $score_narrow - $score_diag;
 516   #  $score_total = -1 if 3 * $min_diag_dist > $match->[ LEN ];
 517
 518     return $score_total;
 519 }
 520
 521
 522 sub score_match_niels
 523 {
 524     # Niels Larsen, June 2004.
 525
 526     # Creates a crude "heuristic" attempt of telling how likely it is that a
 527     # given match occurs by chance in a given search space. If sequences are
 528     # given their composition is taken into account. The scoring punishes
 529     # distance from diagonal(s) and distance from previous match(es). Scores
 530     # range from zero and up, and lower is better.
 531
 532     my ( $match,   # Match array
 533          $q_seq,   # Either q_seq or s_seq
 534          $q_min,   # Lower bound search area (query sequence)
 535          $q_max,   # Upper bound search area (query sequence)
 536          $s_min,   # Lower bound search area (subject sequence)
 537          $s_max,   # Upper bound search area (subject sequence)
 538          ) = @_;
 539
 540     # Returns a positive number.
 541
 542     my ( $q_beg, $s_beg, $q_end, $s_end, $q_dim, $s_dim, $seq, $pos,
 543          $q_delta_beg, $s_delta_beg, $q_delta_end, $s_delta_end, $i,
 544          $delta_beg_max, $delta_end_max, $as, $gs, $ts, $cs, $pmatch,
 545          $score, $moves, $dist_beg, $dist_end, $seqlen, %chars, $delta,
 546          $delta_max );
 547
 548     $q_beg = $match->[Q_BEG];
 549     $q_end = $match->[Q_END];
 550     $s_beg = $match->[S_BEG];
 551     $s_end = $match->[S_END];
 552
 553     # >>>>>>>>>>>>>>>>>>>>>>> CRUDE INITIAL SCORE <<<<<<<<<<<<<<<<<<<<<<
 554
 555     # Get the probability of a match from the sequence composition (when
 556     # match is longer than 20 and sequence is given, otherwise use 0.25)
 557     # and raise that to the power of the length.
 558
 559     if ( $match->[LEN] > 20 and defined $q_seq )
 560     {
 561         $seq = substr ${ $q_seq }, $q_beg, $q_end-$q_beg+1;
 562         $seqlen = length $seq;
 563
 564         $chars{"a"} = $chars{"g"} = $chars{"c"} = $chars{"t"} = 0;
 565
 566         for ( $i = 0; $i < $seqlen; $i++ )
 567         {
 568             $chars{ substr $seq, $i, 1 }++;
 569         }
 570
 571         $pmatch = ($chars{"a"}/$seqlen)**2 + ($chars{"c"}/$seqlen)**2
 572                 + ($chars{"g"}/$seqlen)**2 + ($chars{"t"}/$seqlen)**2;
 573     }
 574     else {
 575         $pmatch = 0.25;
 576     }
 577
 578     $score = $pmatch ** ( $q_end - $q_beg + 1 );
 579
 580 #     # Punish by difference in height and width of search space,
 581
 582 #     $q_dim = $q_max - $q_min + 1;
 583 #     $s_dim = $s_max - $s_min + 1;
 584
 585 #     if ( $q_dim != $s_dim ) {
 586 #         $score *= abs ( $q_dim - $s_dim ) ** 2;
 587 #     }
 588
 589     return $score if $score > 0.25;
 590
 591     # Punish by how far the match is to the closest corner of the search
 592     # space,
 593
 594     $q_delta_beg = $q_beg - $q_min;
 595     $s_delta_beg = $s_beg - $s_min;
 596
 597     $q_delta_end = $q_max - $q_end;
 598     $s_delta_end = $s_max - $s_end;
 599
 600     if ( $q_delta_beg > $s_delta_beg ) {
 601         $delta_beg_max = $q_delta_beg;
 602     } else {
 603         $delta_beg_max = $s_delta_beg;
 604     }
 605
 606     if ( $q_delta_end > $s_delta_end ) {
 607         $delta_end_max = $q_delta_end;
 608     } else {
 609         $delta_end_max = $s_delta_end;
 610     }
 611
 612     if ( $delta_beg_max <= $delta_end_max ) {
 613         $score *= ($delta_beg_max+1) ** 2.0;
 614     } else {
 615         $score *= ($delta_end_max+1) ** 2.0;
 616     }
 617
 618     return $score if $score > 0.25;
 619
 620     # Add penalty if the match is towards the narrow end of the
 621     # search space,
 622
 623     if ( ($q_max - $q_min) <= ($s_max - $s_min) )
 624     {
 625         if ( $q_delta_beg > $s_delta_beg )
 626         {
 627             $score *= 2 * ( $q_delta_beg - $s_delta_beg ) ** 3;
 628         }
 629         elsif ( $q_delta_end > $s_delta_end )
 630         {
 631             $score *= 2 * ( $q_delta_end - $s_delta_end ) ** 3;
 632         }
 633     }
 634     else
 635     {
 636         if ( $s_delta_beg > $q_delta_beg )
 637         {
 638             $score *= 2 * ( $s_delta_beg - $q_delta_beg ) ** 3;
 639         }
 640         elsif ( $s_delta_end > $q_delta_end )
 641         {
 642             $score *= 2 * ( $s_delta_end - $q_delta_end ) ** 3;
 643         }
 644     }
 645
 646     if ( $score < 0 ) {
 647         print STDERR "q_min, q_max, s_min, s_max: $q_min, $q_max, $s_min, $s_max\n";
 648         die qq (Score <= 0 -> $score);
 649     }
 650
 651     return $score;
 652 }
 653
 654
 655 sub print_alignment
 656 {
 657     # Martin A. Hansen, August 2006.
 658
 659     # Routine to print an alignment in fasta format based
 660     # on a list of matches and two given sequences.
 661
 662     my ( $matches,  # list of matches
 663          $q_head,   # query   sequence head
 664          $q_seq,    # query   sequence ref
 665          $s_head,   # subject sequence head
 666          $s_seq,    # subject sequence ref
 667        ) = @_;
 668
 669     my ( $q_pos, $s_pos, $q_nomatch, $q_match, $s_nomatch, $match, $q_aseq, $s_aseq, $i );
 670
 671     @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
 672
 673     $q_pos = 0;
 674     $s_pos = 0;
 675
 676     for ( $i = 0; $i < @{ $matches }; $i++ )
 677     {
 678         $match = $matches->[ $i ];
 679
 680         $q_nomatch = $match->[ Q_BEG ] - $q_pos;
 681         $s_nomatch = $match->[ S_BEG ] - $s_pos;
 682
 683         if ( $q_nomatch - $s_nomatch > 0 )
 684         {
 685             $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
 686             $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
 687             $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
 688         }
 689         elsif ( $s_nomatch - $q_nomatch > 0 )
 690         {
 691             $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
 692             $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
 693             $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
 694         }
 695         else
 696         {
 697             $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
 698             $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
 699         }
 700
 701         $q_pos += $q_nomatch + $match->[ LEN ];
 702         $s_pos += $s_nomatch + $match->[ LEN ];
 703     }
 704
 705     $match = $matches->[ -1 ] || [ 0, 0, 0, 0, 0 ];
 706
 707     $q_nomatch = length( ${ $q_seq } ) - $match->[ Q_END ];
 708     $s_nomatch = length( ${ $s_seq } ) - $match->[ S_END ];
 709
 710     if ( $q_nomatch - $s_nomatch > 0 )
 711     {
 712         $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
 713         $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
 714         $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
 715     }
 716     elsif ( $s_nomatch - $q_nomatch > 0 )
 717     {
 718         $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
 719         $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
 720         $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
 721     }
 722     else
 723     {
 724         $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
 725         $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
 726     }
 727
 728     print ">$q_head\n$q_aseq\n>$s_head\n$s_aseq\n";
 729 }
 730
 731
 732 sub print_matches
 733 {
 734     # Martin A. Hansen, February 2007.
 735
 736     my ( $matches,  # list of matches
 737          $q_head,   # query   sequence head
 738          $q_seq,    # query   sequence ref
 739          $s_head,   # subject sequence head
 740          $s_seq,    # subject sequence ref
 741          $args,     # argument hash         - OPTIONAL
 742        ) = @_;
 743
 744     $args->{ "wrap" } ||= 80;
 745
 746     my ( $q_pos, $s_pos, $match, $q_nomatch, $q_match, $s_nomatch, $q_aseq, $s_aseq, $pins, $i, $q, $s, $q_ruler, $s_ruler, $entries );
 747
 748     @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
 749
 750     $q_pos = 0;
 751     $s_pos = 0;
 752
 753     for ( $i = 0; $i < @{ $matches }; $i++ )
 754     {
 755         $match = $matches->[ $i ];
 756
 757         $q_nomatch = $match->[ Q_BEG ] - $q_pos;
 758         $s_nomatch = $match->[ S_BEG ] - $s_pos;
 759
 760         $q = $q_pos;
 761         $s = $s_pos;
 762
 763         if ( $q_nomatch - $s_nomatch > 0 )
 764         {
 765             $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
 766             $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
 767             $pins   .= " " x ( $q_nomatch - $s_nomatch );
 768             $q += ( $q_nomatch - $s_nomatch );
 769         }
 770         elsif ( $s_nomatch - $q_nomatch > 0 )
 771         {
 772             $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
 773             $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
 774             $pins   .= " " x ( $s_nomatch - $q_nomatch );
 775             $s += ( $s_nomatch - $q_nomatch );
 776         }
 777
 778         while ( $q < $q_pos + $q_nomatch and $s < $s_pos + $s_nomatch )
 779         {
 780             $q_aseq .= substr ${ $q_seq }, $q, 1;
 781             $s_aseq .= substr ${ $s_seq }, $s, 1;
 782
 783             if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) )
 784             {
 785                 $pins .= ":";
 786             } else {
 787                 $pins .= " ";
 788             }
 789
 790             $q++;
 791             $s++;
 792         }
 793
 794         $q_aseq .= substr ${ $q_seq }, $match->[ Q_BEG ], $match->[ LEN ];
 795         $s_aseq .= substr ${ $s_seq }, $match->[ S_BEG ], $match->[ LEN ];
 796         $pins   .= "|" x $match->[ LEN ];
 797
 798         $q_pos += $q_nomatch + $match->[ LEN ];
 799         $s_pos += $s_nomatch + $match->[ LEN ];
 800     }
 801
 802     $q_nomatch = length( ${ $q_seq } ) - ( $match->[ Q_END ] || 0 );
 803     $s_nomatch = length( ${ $s_seq } ) - ( $match->[ S_END ] || 0 );
 804
 805     $q = $q_pos;
 806     $s = $s_pos;
 807
 808     while ( $q < $q_pos + $q_nomatch and $q < length ${ $q_seq } and $s < $s_pos + $s_nomatch and $s < length ${ $s_seq } )
 809     {
 810         $q_aseq .= substr ${ $q_seq }, $q, 1;
 811         $s_aseq .= substr ${ $s_seq }, $s, 1;
 812
 813         if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) ) {
 814             $pins .= ":";
 815         } else {
 816             $pins .= " ";
 817         }
 818
 819         $q++;
 820         $s++;
 821         $q_pos++;
 822         $s_pos++;
 823     }
 824
 825     if ( $q_nomatch - $s_nomatch > 0 )
 826     {
 827         $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
 828         $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
 829         $pins   .= " " x ( $q_nomatch - $s_nomatch );
 830     }
 831     elsif ( $s_nomatch - $q_nomatch > 0 )
 832     {
 833         $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
 834         $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
 835         $pins   .= " " x ( $s_nomatch - $q_nomatch );
 836     }
 837
 838     $q_ruler = make_ruler( $q_aseq );
 839     $s_ruler = make_ruler( $s_aseq );
 840
 841     $entries = [
 842         [ "ruler",     $q_ruler ],
 843         [ $q_head,     $q_aseq  ],
 844         [ "consensus", $pins    ],
 845         [ $s_head,     $s_aseq  ],
 846         [ "ruler",     $s_ruler ],
 847     ];
 848
 849     align_print_multi( $entries, undef, $args->{ "wrap" } )
 850 }
 851
 852
 853 sub make_ruler
 854 {
 855     # Martin A. Hansen, February 2007;
 856
 857     # Gererates a ruler for a given sequence (with indels).
 858
 859     my ( $seq
 860        ) = @_;
 861
 862     # Returns string
 863
 864     my ( $i, $char, $skip, $count, $gap, $tics );
 865
 866     $gap = 0;
 867     $i   = 1;
 868
 869     while ( $i <= length $seq )
 870     {
 871         $char = substr $seq, $i - 1, 1;
 872
 873         $gap++ if $char eq "-";
 874
 875         if ( $skip )
 876         {
 877             $skip--;
 878         }
 879         else
 880         {
 881             $count = $i - $gap;
 882             $count = 1 if $char eq "-";
 883
 884             if ( $count % 100 == 0 )
 885             {
 886                 if ( $count + length( $count ) >= length $seq )
 887                 {
 888                     $tics .= "|";
 889                 }
 890                 else
 891                 {
 892                     $tics .= "|" . $count;
 893                     $skip = length $count;
 894                 }
 895             }
 896             elsif ( $count % 50 == 0 ) {
 897                 $tics .= ":";
 898             } elsif ( $count % 10 == 0 ) {
 899                 $tics .= ".";
 900             } else {
 901                 $tics .= " ";
 902             }
 903         }
 904
 905         $i++;
 906     }
 907
 908     return $tics;
 909 }
 910
 911
 912 sub align_sim_local
 913 {
 914     # Martin A. Hansen, May 2007.
 915
 916     # Calculate the local similarity of an alignment based on
 917     # an alignment chain. The similarity is calculated as
 918     # the number of matching residues divided by the overall
 919     # length of the alignment chain. This means that a short
 920     # but "good" alignment will yield a high similarity, while
 921     # a long "poor" alignment will yeild a low similarity.
 922
 923     my ( $chain,   # list of matches in alignment
 924        ) = @_;
 925
 926     # returns a float
 927
 928     my ( $match, $match_tot, $q_beg, $q_end, $s_beg, $s_end, $q_diff, $s_diff, $max, $sim );
 929
 930     return 0 if not @{ $chain };
 931
 932     $match_tot = 0;
 933     $q_end     = 0;
 934     $s_end     = 0;
 935     $q_beg     = 999999999;
 936     $s_beg     = 999999999;
 937
 938     foreach $match ( @{ $chain } )
 939     {
 940         $match_tot += $match->[ LEN ];
 941
 942         $q_beg = $match->[ Q_BEG ] if $match->[ Q_BEG ] < $q_beg;
 943         $s_beg = $match->[ S_BEG ] if $match->[ S_BEG ] < $s_beg;
 944
 945         $q_end = $match->[ Q_END ] if $match->[ Q_END ] > $q_end;
 946         $s_end = $match->[ S_END ] if $match->[ S_END ] > $s_end;
 947     }
 948
 949     $q_diff = $q_end - $q_beg + 1;
 950     $s_diff = $s_end - $s_beg + 1;
 951
 952     $max = Maasha::Calc::max( $q_diff, $s_diff );
 953
 954     $sim = sprintf( "%.2f", ( $match_tot / $max ) * 100 );
 955
 956     return $sim;
 957 }
 958
 959
 960 sub align_sim_global
 961 {
 962     # Martin A. Hansen, June 2007.
 963
 964     # Calculate the global similarity of an alignment based on
 965     # an alignment chain. The similarity is calculated as
 966     # the number of matching residues divided by the
 967     # length of the shortest sequence.
 968
 969     my ( $chain,   # list of matches in alignment
 970          $q_seq,   # ref to query sequence
 971          $s_seq,   # ref to subject sequence
 972        ) = @_;
 973
 974     # returns a float
 975
 976     my ( $match_tot, $min, $sim );
 977
 978     return 0 if not @{ $chain };
 979
 980     $match_tot = 0;
 981
 982     map { $match_tot += $_->[ LEN ] } @{ $chain };
 983
 984     $min = Maasha::Calc::min( length( ${ $q_seq } ), length( ${ $s_seq } ) );
 985
 986     $sim = sprintf( "%.2f", ( $match_tot / $min ) * 100 );
 987
 988     return $sim;
 989 }
 990
 991
 992 sub align_consensus
 993 {
 994     # Martin A. Hansen, June 2006.
 995
 996     # Given an alignment as a list of FASTA entries,
 997     # generates a consensus sequences based on the
 998     # entropies for each column similar to the way
 999     # a sequence logo i calculated. Returns the
1000     # consensus sequence as a FASTA entry.
1001
1002     my ( $entries,   # list of aligned FASTA entries
1003          $type,      # residue type - OPTIONAL
1004          $min_sim,   # minimum similarity - OPTIONAL
1005        ) = @_;
1006
1007     # Returns tuple
1008
1009     my ( $bit_max, $data, $pos, $char, $score, $entry );
1010
1011     $type    ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
1012     $min_sim ||= 50;
1013
1014     if ( $type =~ /protein/ ) {
1015         $bit_max = 4;
1016     } else {
1017         $bit_max = 2;
1018     }
1019
1020     $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
1021
1022     foreach $pos ( @{ $data } )
1023     {
1024         ( $char, $score ) = @{ $pos->[ -1 ] };
1025
1026         if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
1027             $entry->[ SEQ ] .= $char;
1028         } else {
1029             $entry->[ SEQ ] .= "-";
1030         }
1031     }
1032
1033     $entry->[ HEAD ] = "Consensus: $min_sim%";
1034
1035     return wantarray ? @{ $entry } : $entry;
1036 }
1037
1038
1039 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<