added align_pair_seq

[biopieces.git] / code_perl / Maasha / AlignTwoSeq.pm
diff --git a/code_perl/Maasha/AlignTwoSeq.pm b/code_perl/Maasha/AlignTwoSeq.pm

index 3f861a8dcc1e6ee79c3470c2e662a1e16c82d7d6..00caff6430d1f15ff79d89bba25732e9663515e4 100644 (file)
--- a/code_perl/Maasha/AlignTwoSeq.pm
+++ b/code_perl/Maasha/AlignTwoSeq.pm
@@ -1,4 +1,4 @@
-package Align;
+package Maasha::AlignTwoSeq;
  
  # Copyright (C) 2007 Martin A. Hansen.
  
@@ -31,20 +31,20 @@ package Align;
  use strict;
  use Data::Dumper;
  use Storable qw( dclone );
-use IPC::Open2;
  use Maasha::Calc;
  use Maasha::Seq;
  use vars qw ( @ISA @EXPORT );
  
  use constant {
-    Q_BEG => 0,
-    Q_END => 1,
-    S_BEG => 2,
-    S_END => 3,
-    LEN   => 4,
-    SCORE => 5,
-    HEAD  => 0,
-    SEQ   => 1,
+    Q_BEG    => 0,
+    Q_END    => 1,
+    S_BEG    => 2,
+    S_END    => 3,
+    LEN      => 4,
+    SCORE    => 5,
+    HEAD     => 0,
+    SEQ      => 1,
+    ALPH_LEN => 4,
  };
  
  @ISA = qw( Exporter );
@@ -72,7 +72,6 @@ sub align_two_seq
           $q_max,     # q sequecne stop  position
           $s_min,     # s sequence start position
           $s_max,     # s sequecne stop  position
-         $args,      # argument hash
         ) = @_;
  
      # returns a chain of matches that can be chained into an alignment
@@ -83,17 +82,13 @@ sub align_two_seq
      $q_max   ||= length( ${ $q_seq } ) - 1;
      $s_max   ||= length( ${ $s_seq } ) - 1;
  
-    $args->{ "long_matches" }  ||= 50;
-    $args->{ "alph_len" }      ||= 4;
-
      my ( $wordsize, @chain, $match, $best_match, @long_matches );
  
      $matches = select_matches( $matches, $q_min, $q_max, $s_min, $s_max );
  
      if ( scalar @{ $matches } == 0 )   # no matches - find some!
      {
-        # $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max, $args );
-        $wordsize = 4;
+        $wordsize = find_wordsize( $q_min, $q_max, $s_min, $s_max );
          $matches  = find_matches( $q_seq, $s_seq, $wordsize, $q_min, $q_max, $s_min, $s_max );
  
          while ( scalar @{ $matches } == 0 and $wordsize > 1 )
@@ -103,40 +98,17 @@ sub align_two_seq
          }
   
          if ( scalar @{ $matches } > 0 ) {
-            push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args );
+            push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max );
          }
      }
-    elsif ( @long_matches = grep { $_->[ LEN ] >= $args->{ "long_matches" } } @{ $matches } )   # long matches found - include all that don't overlap!
-    {
-        @long_matches = order_matches( \@long_matches );
-        
-        foreach $match ( @long_matches )
-        {
-            push @chain, $match;
-            
-            if ( $match->[ Q_BEG ] - $q_min >= 2 and $match->[ S_BEG ] - $s_min >= 2 ) { 
-                push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $match->[ Q_BEG ] - 1, $s_min, $match->[ S_BEG ] - 1, $args );   # intermediate search space
-            }
-
-            $q_min = $match->[ Q_END ] + 1;
-            $s_min = $match->[ S_END ] + 1;
-        }
-
-        if ( $q_min + 1 < $q_max and $s_min + 1 < $s_max ) {
-            push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $q_max, $s_min, $s_max, $args );   # remaining search space
-        }
-    }
-    else   # shorter matches are included according to score
+    else   # matches are included according to score
      {
          foreach $match ( @{ $matches } ) {
-            # $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
-            $match->[ SCORE ] = score_match_niels( $match, $q_seq, $q_min, $q_max, $s_min, $s_max );
+            $match->[ SCORE ] = score_match( $match, $q_min, $q_max, $s_min, $s_max );
          }
  
-        # @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
-        @{ $matches } = grep { $_->[ SCORE ] <= 0.25 } @{ $matches };
-        # @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
-        @{ $matches } = sort { $a->[ SCORE ] <=> $b->[ SCORE ] } @{ $matches };
+        @{ $matches } = grep { $_->[ SCORE ] > 0 } @{ $matches };
+        @{ $matches } = sort { $b->[ SCORE ] <=> $a->[ SCORE ] } @{ $matches };
  
          $best_match   = shift @{ $matches };
  
@@ -145,11 +117,11 @@ sub align_two_seq
              push @chain, $best_match;
  
              if ( $best_match->[ Q_BEG ] - $q_min >= 2 and $best_match->[ S_BEG ] - $s_min >= 2 ) {
-                push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1, $args );   # left search space
+                push @chain, align_two_seq( $q_seq, $s_seq, $matches, $q_min, $best_match->[ Q_BEG ] - 1, $s_min, $best_match->[ S_BEG ] - 1 );   # left search space
              }
  
              if ( $q_max - $best_match->[ Q_END ] >= 2 and $s_max - $best_match->[ S_END ] >= 2 ) {
-                push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max, $args );   # right search space
+                push @chain, align_two_seq( $q_seq, $s_seq, $matches, $best_match->[ Q_END ] + 1, $q_max, $best_match->[ S_END ] + 1, $s_max );   # right search space
              }
          }
      }
@@ -186,53 +158,6 @@ sub select_matches
  }
  
  
-sub order_matches
-{
-    # Martin A. Hansen, October 2006
-
-    # given a list of long matches, order these by length and position
-    # and include only those long matches that does not cross.
-
-    my ( $long_matches,   # list of matches
-       ) = @_;
-
-    # returns a list of matches
-
-    my ( @matches, $match, $i );
-
-    @{ $long_matches } = sort { $b->[ LEN ] <=> $a->[ LEN ] } @{ $long_matches };
-        
-    @matches = shift @{ $long_matches };
-
-    foreach $match ( @{ $long_matches } )
-    {
-        if ( $match->[ Q_END ] < $matches[ 0 ]->[ Q_BEG ] and $match->[ S_END ] < $matches[ 0 ]->[ S_BEG ] )
-        {
-            unshift @matches, $match;
-        }
-        elsif ( $match->[ Q_BEG ] > $matches[ -1 ]->[ Q_END ] and $match->[ S_BEG ] > $matches[ -1 ]->[ S_END ] )
-        {
-            push @matches, $match; 
-        }
-        else
-        {
-            for ( $i = 1; $i < @matches; $i++ )
-            {
-                if ( $matches[ $i - 1 ]->[ Q_END ] < $match->[ Q_BEG ] and $match->[ Q_END ] < $matches[ $i ]->[ Q_BEG ] and
-                     $matches[ $i - 1 ]->[ S_END ] < $match->[ S_BEG ] and $match->[ S_END ] < $matches[ $i ]->[ S_BEG ]
-                )
-                {
-                    splice @matches, $i, 0, dclone $match;
-                    last;
-                }
-            }
-        }
-    }
-
-    return wantarray ? @matches : \@matches; 
-}
-
-
  sub find_wordsize
  {
      # Martin A. Hansen, August 2006.
@@ -245,7 +170,6 @@ sub find_wordsize
           $q_max,   # q sequecne stop position
           $s_min,   # s sequence start position
           $s_max,   # s sequecne stop position
-         $args,    # argument hash
         ) = @_;
  
      # returns integer
@@ -259,23 +183,23 @@ sub find_wordsize
  
      $wordsize = 1;
  
-    if ( $dim_min > 2000000 )                # optimized for DNA
+    if ( $dim_min > 2000000 )               # optimized for DNA
      {
-        $wordsize = $args->{ "long_matches" };
+        $wordsize = 50;
      }
-    elsif ( $dim_min > 100000 )                # optimized for DNA
+    elsif ( $dim_min > 100000 )             # optimized for DNA
      {
-        $wordsize = int( $args->{ "long_matches" } / 2 );
+        $wordsize = 25;
      }
      elsif ( $q_dim > 100 or $s_dim > 100 )  # optimized for DNA
      {
-        while ( $args->{ "alph_len" } ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
+        while ( ALPH_LEN ** $wordsize <= $q_dim * $s_dim and $wordsize < $dim_min ) {
              $wordsize++;
          }
      }
      else
      {
-        while ( $args->{ "alph_len" } ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
+        while ( ALPH_LEN ** $wordsize <= $dim_min and $wordsize < $dim_min ) {
              $wordsize++;
          }
      }
@@ -513,7 +437,6 @@ sub score_match
      $score_narrow = $max_narrow_dist;
  
      $score_total = $score_len - $score_narrow - $score_diag;
-  #  $score_total = -1 if 3 * $min_diag_dist > $match->[ LEN ];
  
      return $score_total;
  }
@@ -652,260 +575,82 @@ sub score_match_niels
  }
  
  
-sub print_alignment
+sub insert_indels
  {
-    # Martin A. Hansen, August 2006.
+    # Martin A. Hansen, June 2009.
  
-    # Routine to print an alignment in fasta format based
-    # on a list of matches and two given sequences.
+    # Given a list of matches and references to q_seq and s_seq,
+    # insert indels to form aligned sequences.
  
-    my ( $matches,  # list of matches
-         $q_head,   # query   sequence head
-         $q_seq,    # query   sequence ref
-         $s_head,   # subject sequence head
-         $s_seq,    # subject sequence ref
+    my ( $matches,   # list of matches
+         $q_seq,     # ref to query sequence
+         $s_seq,     # ref to subject sequence
         ) = @_;
  
-    my ( $q_pos, $s_pos, $q_nomatch, $q_match, $s_nomatch, $match, $q_aseq, $s_aseq, $i );
+    # Returns nothing.
+
+    my ( $q_indels, $s_indels, $match, $diff, $first );
  
      @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
  
-    $q_pos = 0;
-    $s_pos = 0;
+    $first    = 1;
+    $q_indels = 0;
+    $s_indels = 0;
  
-    for ( $i = 0; $i < @{ $matches }; $i++ )
-    {
-        $match = $matches->[ $i ];
-    
-        $q_nomatch = $match->[ Q_BEG ] - $q_pos;
-        $s_nomatch = $match->[ S_BEG ] - $s_pos;
+    # ---- initial indels ----
  
-        if ( $q_nomatch - $s_nomatch > 0 ) 
-        {
-            $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
-            $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
-            $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
-        }
-        elsif ( $s_nomatch - $q_nomatch > 0 )
-        {
-            $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
-            $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
-            $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
-        }
-        else
-        {
-            $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
-            $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
-        }
-    
-        $q_pos += $q_nomatch + $match->[ LEN ];    
-        $s_pos += $s_nomatch + $match->[ LEN ];    
-    }
+    $match = shift @{ $matches };
  
-    $match = $matches->[ -1 ] || [ 0, 0, 0, 0, 0 ]; 
+    substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
+    substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
  
-    $q_nomatch = length( ${ $q_seq } ) - $match->[ Q_END ];
-    $s_nomatch = length( ${ $s_seq } ) - $match->[ S_END ];
+    $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
  
-    if ( $q_nomatch - $s_nomatch > 0 ) 
+    if ( $diff < 0 )
      {
-        $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
-        $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
-        $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
-    }
-    elsif ( $s_nomatch - $q_nomatch > 0 )
-    {
-        $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
-        $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
-        $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
+        substr ${ $q_seq }, 0, 0, '-' x abs $diff;
+
+        $q_indels += abs $diff;
      }
-    else
+    elsif ( $diff > 0 )
      {
-        $q_aseq .= substr ${ $q_seq }, $q_pos, $q_nomatch + $match->[ LEN ];
-        $s_aseq .= substr ${ $s_seq }, $s_pos, $s_nomatch + $match->[ LEN ];
-    }
-
-    print ">$q_head\n$q_aseq\n>$s_head\n$s_aseq\n";
-}
-
-
-sub print_matches
-{
-    # Martin A. Hansen, February 2007.
-
-    my ( $matches,  # list of matches
-         $q_head,   # query   sequence head
-         $q_seq,    # query   sequence ref
-         $s_head,   # subject sequence head
-         $s_seq,    # subject sequence ref
-         $args,     # argument hash         - OPTIONAL
-       ) = @_;
+        substr ${ $s_seq }, 0, 0, '-' x abs $diff;
  
-    $args->{ "wrap" } ||= 80;
-
-    my ( $q_pos, $s_pos, $match, $q_nomatch, $q_match, $s_nomatch, $q_aseq, $s_aseq, $pins, $i, $q, $s, $q_ruler, $s_ruler, $entries );
-
-    @{ $matches } = sort { $a->[ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
+        $s_indels += abs $diff;
+    }
  
-    $q_pos = 0;
-    $s_pos = 0;
+    # ---- internal indels ----
  
-    for ( $i = 0; $i < @{ $matches }; $i++ )
+    foreach $match ( @{ $matches } )
      {
-        $match = $matches->[ $i ];
+        substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
+        substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
  
-        $q_nomatch = $match->[ Q_BEG ] - $q_pos;
-        $s_nomatch = $match->[ S_BEG ] - $s_pos;
+        $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
  
-        $q = $q_pos;
-        $s = $s_pos;
-
-        if ( $q_nomatch - $s_nomatch > 0 ) 
-        {
-            $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
-            $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
-            $pins   .= " " x ( $q_nomatch - $s_nomatch );
-            $q += ( $q_nomatch - $s_nomatch );
-        }
-        elsif ( $s_nomatch - $q_nomatch > 0 )
+        if ( $diff < 0 )
          {
-            $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
-            $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
-            $pins   .= " " x ( $s_nomatch - $q_nomatch );
-            $s += ( $s_nomatch - $q_nomatch );
-        }
+            substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff;
  
-        while ( $q < $q_pos + $q_nomatch and $s < $s_pos + $s_nomatch )
-        {
-            $q_aseq .= substr ${ $q_seq }, $q, 1;
-            $s_aseq .= substr ${ $s_seq }, $s, 1;
-
-            if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) )
-            {
-                $pins .= ":";
-            } else {
-                $pins .= " ";
-            }
-
-            $q++;
-            $s++;
+            $q_indels += abs $diff;
          }
+        elsif ( $diff > 0 )
+        {
+            substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff;
  
-        $q_aseq .= substr ${ $q_seq }, $match->[ Q_BEG ], $match->[ LEN ];
-        $s_aseq .= substr ${ $s_seq }, $match->[ S_BEG ], $match->[ LEN ];
-        $pins   .= "|" x $match->[ LEN ];
-
-        $q_pos += $q_nomatch + $match->[ LEN ];
-        $s_pos += $s_nomatch + $match->[ LEN ];
-    }
-
-    $q_nomatch = length( ${ $q_seq } ) - ( $match->[ Q_END ] || 0 );
-    $s_nomatch = length( ${ $s_seq } ) - ( $match->[ S_END ] || 0 );
-
-    $q = $q_pos;
-    $s = $s_pos;
-
-    while ( $q < $q_pos + $q_nomatch and $q < length ${ $q_seq } and $s < $s_pos + $s_nomatch and $s < length ${ $s_seq } )
-    {
-        $q_aseq .= substr ${ $q_seq }, $q, 1;
-        $s_aseq .= substr ${ $s_seq }, $s, 1;
-
-        if ( substr( ${ $q_seq }, $q, 1 ) eq substr( ${ $s_seq }, $s, 1 ) ) {
-            $pins .= ":";
-        } else {
-            $pins .= " ";
+            $s_indels += abs $diff;
          }
-
-        $q++;
-        $s++;
-        $q_pos++;
-        $s_pos++;
-    }
-
-    if ( $q_nomatch - $s_nomatch > 0 ) 
-    {
-        $q_aseq .= substr ${ $q_seq }, $q_pos, ( $q_nomatch - $s_nomatch );
-        $s_aseq .= "-" x ( $q_nomatch - $s_nomatch );
-        $pins   .= " " x ( $q_nomatch - $s_nomatch );
-    }
-    elsif ( $s_nomatch - $q_nomatch > 0 )
-    {
-        $s_aseq .= substr ${ $s_seq }, $s_pos, ( $s_nomatch - $q_nomatch );
-        $q_aseq .= "-" x ( $s_nomatch - $q_nomatch );
-        $pins   .= " " x ( $s_nomatch - $q_nomatch );
      }
  
-    $q_ruler = make_ruler( $q_aseq );    
-    $s_ruler = make_ruler( $s_aseq );    
+    # ---- last indels ----
  
-    $entries = [
-        [ "ruler",     $q_ruler ],
-        [ $q_head,     $q_aseq  ],
-        [ "consensus", $pins    ],
-        [ $s_head,     $s_aseq  ],
-        [ "ruler",     $s_ruler ],
-    ];
+    $diff = length( ${ $s_seq } ) - length( ${ $q_seq } );
  
-    align_print_multi( $entries, undef, $args->{ "wrap" } )
-}
-
-
-sub make_ruler
-{
-    # Martin A. Hansen, February 2007;
-
-    # Gererates a ruler for a given sequence (with indels).
-
-    my ( $seq
-       ) = @_;
-
-    # Returns string
-
-    my ( $i, $char, $skip, $count, $gap, $tics );
-
-    $gap = 0;
-    $i   = 1;
-
-    while ( $i <= length $seq )
-    {
-        $char = substr $seq, $i - 1, 1;
- 
-        $gap++ if $char eq "-";
-
-        if ( $skip )
-        {
-            $skip--;
-        }
-        else
-        {
-            $count = $i - $gap;
-            $count = 1 if $char eq "-";
-
-            if ( $count % 100 == 0 )
-            {
-                if ( $count + length( $count ) >= length $seq )
-                {
-                    $tics .= "|";
-                }
-                else
-                {
-                    $tics .= "|" . $count;
-                    $skip = length $count;
-                }
-            }
-            elsif ( $count % 50 == 0 ) {
-                $tics .= ":";
-            } elsif ( $count % 10 == 0 ) {
-                $tics .= ".";
-            } else {
-                $tics .= " ";
-            }
-        }
-
-        $i++;
+    if ( $diff > 0 ) {
+         ${ $q_seq } .= '-' x abs $diff;
+    } else {
+         ${ $s_seq } .= '-' x abs $diff;
      }
-
-    return $tics;
  }