From 24e6535de6ee89f8e8694aa27749e2275786b6a3 Mon Sep 17 00:00:00 2001
From: martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Date: Fri, 28 Aug 2009 11:26:54 +0000
Subject: [PATCH] more work on AlignTwoSeq

git-svn-id: http://biopieces.googlecode.com/svn/trunk@647 74ccb610-7750-0410-82ae-013aeee3265d
---
 bp_bin/align_pair_seq                     |  10 +-
 code_perl/Maasha/AlignTwoSeq.pm           | 218 ++++++++++++----------
 code_perl/Maasha/test/test_AlignTwoSeq.pl | 126 ++++++++++++-
 3 files changed, 246 insertions(+), 108 deletions(-)

diff --git a/bp_bin/align_pair_seq b/bp_bin/align_pair_seq
index 24b8c65..7f622f1 100755
--- a/bp_bin/align_pair_seq
+++ b/bp_bin/align_pair_seq
@@ -36,7 +36,7 @@ use Data::Dumper;
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 
-my ( $options, $in, $out, $count, $record, @records );
+my ( $options, $in, $out, $count, $record, @records, $space, $matches );
 
 $options = Maasha::Biopieces::parse_options();
 
@@ -55,13 +55,9 @@ while ( $record = Maasha::Biopieces::get_record( $in ) )
 
         if ( $count == 2 )
         {
-            $records[ 0 ]->{ 'SEQ' } = lc $records[ 0 ]->{ 'SEQ' };
-            $records[ 1 ]->{ 'SEQ' } = lc $records[ 1 ]->{ 'SEQ' };
-
-            my $matches = Maasha::AlignTwoSeq::align_two_seq( { 'Q_SEQ' => \$records[ 0 ]->{ 'SEQ' } , 'S_SEQ' => \$records[ 1 ]->{ 'SEQ' } }, [] );
+            $matches = Maasha::AlignTwoSeq::align_two_seq( { 'Q_SEQ' => \$records[ 0 ]->{ 'SEQ' } , 'S_SEQ' => \$records[ 1 ]->{ 'SEQ' } }, [] );
             
-            print Dumper( $matches );
-
+            Maasha::AlignTwoSeq::shift_case( $matches, \$records[ 0 ]->{ 'SEQ' } ,\$records[ 1 ]->{ 'SEQ' } );
             Maasha::AlignTwoSeq::insert_indels( $matches, \$records[ 0 ]->{ 'SEQ' } ,\$records[ 1 ]->{ 'SEQ' } );
 
             map { Maasha::Biopieces::put_record( $_, $out ) } @records;
diff --git a/code_perl/Maasha/AlignTwoSeq.pm b/code_perl/Maasha/AlignTwoSeq.pm
index 506f3a2..54234ed 100644
--- a/code_perl/Maasha/AlignTwoSeq.pm
+++ b/code_perl/Maasha/AlignTwoSeq.pm
@@ -371,7 +371,7 @@ sub match_expand_forward
 
     while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
             $match->{ 'S_END' } <= $space->{ 'S_MAX' } and 
-            lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
+            uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
     {
         $match->{ 'Q_END' }++;
         $match->{ 'S_END' }++;
@@ -400,7 +400,7 @@ sub match_expand_backward
 
     while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
             $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and 
-            lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
+            uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
     {
         $match->{ 'Q_BEG'}--;
         $match->{ 'S_BEG'}--;
@@ -507,9 +507,19 @@ sub match_score
 
     # Returns nothing.
 
-    match_score_len( $match );
-    match_score_diag( $match, $space );
-    match_score_narrow( $match, $space );
+    my ( $score_len, $score_corner, $score_diag, $score_narrow, $score_total );
+
+    $score_len    = match_score_len( $match );
+    $score_corner = match_score_corner( $match, $space );
+    $score_diag   = match_score_diag( $match, $space );
+    $score_narrow = 0; #match_score_narrow( $match, $space );
+
+    $score_total  = $score_len - $score_corner - $score_diag - $score_narrow;
+
+    # print STDERR "LEN: $score_len   CORNER: $score_corner   DIAG: $score_diag   NARROW: $score_narrow: TOTAL: $score_total\n" if $score_total > 0;
+    print STDERR "LEN: $score_len   CORNER: $score_corner   DIAG: $score_diag   NARROW: $score_narrow: TOTAL: $score_total\n";
+
+    $match->{ 'SCORE' } = $score_total;
 }
 
 
@@ -522,9 +532,32 @@ sub match_score_len
     my ( $match,   # match
        ) = @_;
 
-    # Returns nothing.
+    # Returns integer.
 
-    $match->{ 'SCORE' } = $match->{ 'LEN' } ** 3;
+    return $match->{ 'LEN' } * 2;
+}
+
+
+sub match_score_corner
+{
+    # Martin A. Hansen, August 2009
+
+    # Score according to distance from corner.
+
+    my ( $match,   # match
+         $space,   # search space
+       ) = @_;
+
+    # Returns integer.
+
+    my ( $left_dist, $right_dist, $score_corner );
+
+    $left_dist  = Maasha::Calc::dist_point2point( $space->{ 'Q_MIN' }, $space->{ 'S_MIN' }, $match->{ 'Q_BEG' } , $match->{ 'S_BEG' } );
+    $right_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MAX' }, $space->{ 'S_MAX' }, $match->{ 'Q_END' } , $match->{ 'S_END' } );
+
+    $score_corner = Maasha::Calc::min( $left_dist, $right_dist );
+
+    return int $score_corner * 1.5;
 }
 
 
@@ -538,7 +571,7 @@ sub match_score_diag
          $space,   # search space
        ) = @_;
 
-    # Returns nothing.
+    # Returns integer.
 
     my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag );
 
@@ -573,16 +606,16 @@ sub match_score_diag
         $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
                                                         $match->{ 'S_BEG' },
                                                         $space->{ 'Q_MAX' } - $q_dim,
-                                                        $space->{ 's_max' } - $q_dim,
+                                                        $space->{ 'S_MAX' } - $q_dim,
                                                         $space->{ 'Q_MAX' },
                                                         $space->{ 'S_MAX' } );
     }
 
     $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
 
-    $score_diag = 2 * $min_diag_dist ** 2;
+    $score_diag = $min_diag_dist;
 
-    $match->{ 'SCORE' } -= $score_diag;
+    return int $score_diag;
 }
 
 
@@ -596,7 +629,7 @@ sub match_score_narrow
          $space,   # search space
        ) = @_;
 
-    # Returns nothing.
+    # Returns integer.
 
     my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow );
 
@@ -622,92 +655,124 @@ sub match_score_narrow
 
     $score_narrow = $max_narrow_dist;
 
-    $match->{ 'SCORE' } -= $score_narrow;
+    return abs( $beg_narrow_dist - $end_narrow_dist );
+
+    return $score_narrow;
 }
 
 
-__END__
+sub shift_case
+{
+    # Martin A. Hansen, August 2009.
+
+    # Given a list of matches and sequence references
+    # uppercase only matching sequnce while lowercasing the rest.
+
+    my ( $matches,   # list of matches
+         $q_seq,     # query sequence ref
+         $s_seq,     # subject sequence ref
+       ) = @_;
+
+    # Returns nothing.
+
+    my ( $match );
+
+    ${ $q_seq } = lc ${ $q_seq };
+    ${ $s_seq } = lc ${ $s_seq };
+
+    foreach $match ( @{ $matches } )
+    {
+        substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }, uc substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' };
+        substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }, uc substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' };
+    }
+}
+
 
 
 sub insert_indels
 {
-    # Martin A. Hansen, June 2009.
+    # Martin A. Hansen, August 2009.
 
-    # Given a list of matches and references to q_seq and s_seq,
+    # Given a list of matches and sequence references
     # insert indels to form aligned sequences.
 
     my ( $matches,   # list of matches
-         $q_seq,     # ref to query sequence
-         $s_seq,     # ref to subject sequence
+         $q_seq,     # query sequence ref
+         $s_seq,     # subject sequence ref
        ) = @_;
 
-    # Returns nothing.
-
-    my ( $q_indels, $s_indels, $match, $diff, $first );
+    my ( $i, $q_dist, $s_dist, $q_indels, $s_indels, $diff );
 
-    @{ $matches } = sort { $a->{ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
-
-    $first    = 1;
     $q_indels = 0;
     $s_indels = 0;
 
-    # ---- initial indels ----
-
-    $match = shift @{ $matches };
+    if ( @{ $matches } > 0 )
+    {
+    @{ $matches } = sort { $a->{ 'Q_BEG' } <=> $b->{ 'Q_BEG' } } @{ $matches };  # FIXME is this necessary?
 
-    substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
-    substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
+    # >>>>>>>>>>>>>> FIRST MATCH <<<<<<<<<<<<<<
 
-    $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
+    $diff = $matches->[ 0 ]->{ 'Q_BEG' } - $matches->[ 0 ]->{ 'S_BEG' };
 
-    if ( $diff < 0 )
+    if ( $diff > 0 )
     {
-        substr ${ $q_seq }, 0, 0, '-' x abs $diff;
+        substr ${ $s_seq }, 0, 0, '-' x $diff;
 
-        $q_indels += abs $diff;
+        $s_indels += $diff;
     }
-    elsif ( $diff > 0 )
+    elsif ( $diff < 0 )
     {
-        substr ${ $s_seq }, 0, 0, '-' x abs $diff;
+        substr ${ $q_seq }, 0, 0, '-' x abs $diff;
 
-        $s_indels += abs $diff;
+        $q_indels += abs $diff;
     }
 
-    # ---- internal indels ----
+    # >>>>>>>>>>>>>> MIDDLE MATCHES <<<<<<<<<<<<<<
 
-    foreach $match ( @{ $matches } )
+    for ( $i = 0; $i < scalar @{ $matches } - 1; $i++ )
     {
-        substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
-        substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
+        $q_dist = $matches->[ $i + 1 ]->{ 'Q_BEG' } - $matches->[ $i ]->{ 'Q_END' };
+        $s_dist = $matches->[ $i + 1 ]->{ 'S_BEG' } - $matches->[ $i ]->{ 'S_END' };
 
-        $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
+        $diff   = $q_dist - $s_dist;
 
-        if ( $diff < 0 )
+        if ( $diff > 0 )
         {
-            substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff;
+            substr ${ $s_seq }, $s_indels + $matches->[ $i ]->{ 'S_END' } + 1, 0, '-' x $diff;
 
-            $q_indels += abs $diff;
+            $s_indels += $diff;
         }
-        elsif ( $diff > 0 )
+        elsif ( $diff < 0 )
         {
-            substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff;
+            substr ${ $q_seq }, $q_indels + $matches->[ $i ]->{ 'Q_END' } + 1, 0, '-' x abs $diff;
 
-            $s_indels += abs $diff;
+            $q_indels += abs $diff;
         }
     }
+    }
 
-    # ---- last indels ----
+    # >>>>>>>>>>>>>> LAST MATCH <<<<<<<<<<<<<<
 
-    $diff = length( ${ $s_seq } ) - length( ${ $q_seq } );
+    $diff = length( ${ $q_seq } ) - length( ${ $s_seq } );
 
-    if ( $diff > 0 ) {
-         ${ $q_seq } .= '-' x abs $diff;
-    } else {
-         ${ $s_seq } .= '-' x abs $diff;
+    if ( $diff > 0 )
+    {
+        ${ $s_seq } .= '-' x $diff;
+
+        $s_indels += $diff;
+    }
+    else
+    {
+        ${ $q_seq } .= '-' x abs $diff;
+
+        $q_indels += abs $diff;
     }
 }
 
 
+__END__
+
+
 sub align_sim_local
 {
     # Martin A. Hansen, May 2007.
@@ -788,51 +853,4 @@ sub align_sim_global
 }
 
 
-sub align_consensus
-{
-    # Martin A. Hansen, June 2006.
-
-    # Given an alignment as a list of FASTA entries,
-    # generates a consensus sequences based on the
-    # entropies for each column similar to the way
-    # a sequence logo i calculated. Returns the
-    # consensus sequence as a FASTA entry.
-
-    my ( $entries,   # list of aligned FASTA entries
-         $type,      # residue type - OPTIONAL
-         $min_sim,   # minimum similarity - OPTIONAL
-       ) = @_;
-
-    # Returns tuple
-
-    my ( $bit_max, $data, $pos, $char, $score, $entry );
-
-    $type    ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
-    $min_sim ||= 50;
-
-    if ( $type =~ /protein/ ) {
-        $bit_max = 4;   
-    } else {
-        $bit_max = 2;
-    }
-
-    $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
-
-    foreach $pos ( @{ $data } )
-    {
-        ( $char, $score ) = @{ $pos->[ -1 ] };
-        
-        if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
-            $entry->[ SEQ ] .= $char;
-        } else {
-            $entry->[ SEQ ] .= "-";
-        }
-    }
-
-    $entry->[ HEAD ] = "Consensus: $min_sim%";
-
-    return wantarray ? @{ $entry } : $entry;
-}
-
-
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
diff --git a/code_perl/Maasha/test/test_AlignTwoSeq.pl b/code_perl/Maasha/test/test_AlignTwoSeq.pl
index 0211928..cd59552 100755
--- a/code_perl/Maasha/test/test_AlignTwoSeq.pl
+++ b/code_perl/Maasha/test/test_AlignTwoSeq.pl
@@ -36,6 +36,8 @@ test_match_score();
 
 test_align_two_seq();
 
+test_insert_indels();
+
 
 sub test_new_space
 {
@@ -509,9 +511,131 @@ sub test_align_two_seq
 
     $matches = Maasha::AlignTwoSeq::align_two_seq( $space, [] );
 
-    print Dumper( $matches );
+#    print Dumper( $matches );
 
     ok( 0 );
 }
 
 
+sub test_insert_indels
+{
+    my ( $matches, $q_seq, $s_seq );
+
+    $matches = [
+        { Q_BEG => 1, S_BEG => 1, Q_END => 4, S_END => 4 }
+    ];
+
+    $q_seq = "XATCG";
+    $s_seq = "PATCG";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "XATCG" );
+    is( $s_seq, "PATCG" );
+
+    $matches = [
+        { Q_BEG => 0, S_BEG => 1, Q_END => 3, S_END => 4 }
+    ];
+
+    $q_seq = "ATCG";
+    $s_seq = "PATCG";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "-ATCG" );
+    is( $s_seq, "PATCG" );
+
+    $matches = [
+        { Q_BEG => 1, S_BEG => 0, Q_END => 4, S_END => 3 }
+    ];
+
+    $q_seq = "XATCG";
+    $s_seq = "ATCG";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "XATCG" );
+    is( $s_seq, "-ATCG" );
+
+    $matches = [
+        { Q_BEG => 0, S_BEG => 0, Q_END => 3, S_END => 3 },
+        { Q_BEG => 6, S_BEG => 6, Q_END => 9, S_END => 9 },
+    ];
+
+    $q_seq = "ATCGXXATCG";
+    $s_seq = "ATCGNNATCG";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "ATCGXXATCG" );
+    is( $s_seq, "ATCGNNATCG" );
+
+    $matches = [
+        { Q_BEG => 0, S_BEG => 0, Q_END => 3, S_END => 3 },
+        { Q_BEG => 6, S_BEG => 4, Q_END => 9, S_END => 7 },
+    ];
+
+    $q_seq = "ATCGXXATCG";
+    $s_seq = "ATCGATCG";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "ATCGXXATCG" );
+    is( $s_seq, "ATCG--ATCG" );
+
+    $matches = [
+        { Q_BEG => 0, S_BEG => 1, Q_END => 2, S_END => 3 },
+        { Q_BEG => 5, S_BEG => 4, Q_END => 8, S_END => 7 },
+    ];
+
+    $q_seq = "TCGXXATCG";
+    $s_seq = "ATCGATCG";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "-TCGXXATCG" );
+    is( $s_seq, "ATCG--ATCG" );
+
+    $matches = [
+        { Q_BEG => 1, S_BEG => 0, Q_END => 3, S_END => 2 },
+        { Q_BEG => 6, S_BEG => 3, Q_END => 9, S_END => 6 },
+    ];
+
+    $q_seq = "ATCGXXATCG";
+    $s_seq = "TCGATCG";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "ATCGXXATCG" );
+    is( $s_seq, "-TCG--ATCG" );
+
+    $matches = [
+        { Q_BEG => 1, Q_END =>  3, S_BEG => 0, S_END => 2 },
+        { Q_BEG => 6, Q_END =>  7, S_BEG => 3, S_END => 4 },
+        { Q_BEG => 9, Q_END => 10, S_BEG => 7, S_END => 8 },
+    ];
+
+    $q_seq = "ATCGXXATACG";
+    $s_seq = "TCGATNTCG";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "ATCGXXAT-ACG" );
+    #             |||  ||  ||
+    is( $s_seq, "-TCG--ATNTCG" );
+
+    $matches = [
+        { Q_BEG => 1, Q_END =>  3, S_BEG => 0, S_END => 2 },
+        { Q_BEG => 6, Q_END =>  7, S_BEG => 3, S_END => 4 },
+        { Q_BEG => 9, Q_END => 10, S_BEG => 7, S_END => 8 },
+    ];
+
+    $q_seq = "ATCGXXATACG";
+    $s_seq = "TCGATNTCGXX";
+
+    Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+    is( $q_seq, "ATCGXXAT-ACG--" );
+    #             |||  ||  ||
+    is( $s_seq, "-TCG--ATNTCGXX" );
+}
-- 
2.39.5