From 24e6535de6ee89f8e8694aa27749e2275786b6a3 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 28 Aug 2009 11:26:54 +0000 Subject: [PATCH] more work on AlignTwoSeq git-svn-id: http://biopieces.googlecode.com/svn/trunk@647 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/align_pair_seq | 10 +- code_perl/Maasha/AlignTwoSeq.pm | 218 ++++++++++++---------- code_perl/Maasha/test/test_AlignTwoSeq.pl | 126 ++++++++++++- 3 files changed, 246 insertions(+), 108 deletions(-) diff --git a/bp_bin/align_pair_seq b/bp_bin/align_pair_seq index 24b8c65..7f622f1 100755 --- a/bp_bin/align_pair_seq +++ b/bp_bin/align_pair_seq @@ -36,7 +36,7 @@ use Data::Dumper; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -my ( $options, $in, $out, $count, $record, @records ); +my ( $options, $in, $out, $count, $record, @records, $space, $matches ); $options = Maasha::Biopieces::parse_options(); @@ -55,13 +55,9 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) if ( $count == 2 ) { - $records[ 0 ]->{ 'SEQ' } = lc $records[ 0 ]->{ 'SEQ' }; - $records[ 1 ]->{ 'SEQ' } = lc $records[ 1 ]->{ 'SEQ' }; - - my $matches = Maasha::AlignTwoSeq::align_two_seq( { 'Q_SEQ' => \$records[ 0 ]->{ 'SEQ' } , 'S_SEQ' => \$records[ 1 ]->{ 'SEQ' } }, [] ); + $matches = Maasha::AlignTwoSeq::align_two_seq( { 'Q_SEQ' => \$records[ 0 ]->{ 'SEQ' } , 'S_SEQ' => \$records[ 1 ]->{ 'SEQ' } }, [] ); - print Dumper( $matches ); - + Maasha::AlignTwoSeq::shift_case( $matches, \$records[ 0 ]->{ 'SEQ' } ,\$records[ 1 ]->{ 'SEQ' } ); Maasha::AlignTwoSeq::insert_indels( $matches, \$records[ 0 ]->{ 'SEQ' } ,\$records[ 1 ]->{ 'SEQ' } ); map { Maasha::Biopieces::put_record( $_, $out ) } @records; diff --git a/code_perl/Maasha/AlignTwoSeq.pm b/code_perl/Maasha/AlignTwoSeq.pm index 506f3a2..54234ed 100644 --- a/code_perl/Maasha/AlignTwoSeq.pm +++ b/code_perl/Maasha/AlignTwoSeq.pm @@ -371,7 +371,7 @@ sub match_expand_forward while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and $match->{ 'S_END' } <= $space->{ 'S_MAX' } and - lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) ) + uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) ) { $match->{ 'Q_END' }++; $match->{ 'S_END' }++; @@ -400,7 +400,7 @@ sub match_expand_backward while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and $match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and - lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) ) + uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) ) { $match->{ 'Q_BEG'}--; $match->{ 'S_BEG'}--; @@ -507,9 +507,19 @@ sub match_score # Returns nothing. - match_score_len( $match ); - match_score_diag( $match, $space ); - match_score_narrow( $match, $space ); + my ( $score_len, $score_corner, $score_diag, $score_narrow, $score_total ); + + $score_len = match_score_len( $match ); + $score_corner = match_score_corner( $match, $space ); + $score_diag = match_score_diag( $match, $space ); + $score_narrow = 0; #match_score_narrow( $match, $space ); + + $score_total = $score_len - $score_corner - $score_diag - $score_narrow; + + # print STDERR "LEN: $score_len CORNER: $score_corner DIAG: $score_diag NARROW: $score_narrow: TOTAL: $score_total\n" if $score_total > 0; + print STDERR "LEN: $score_len CORNER: $score_corner DIAG: $score_diag NARROW: $score_narrow: TOTAL: $score_total\n"; + + $match->{ 'SCORE' } = $score_total; } @@ -522,9 +532,32 @@ sub match_score_len my ( $match, # match ) = @_; - # Returns nothing. + # Returns integer. - $match->{ 'SCORE' } = $match->{ 'LEN' } ** 3; + return $match->{ 'LEN' } * 2; +} + + +sub match_score_corner +{ + # Martin A. Hansen, August 2009 + + # Score according to distance from corner. + + my ( $match, # match + $space, # search space + ) = @_; + + # Returns integer. + + my ( $left_dist, $right_dist, $score_corner ); + + $left_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MIN' }, $space->{ 'S_MIN' }, $match->{ 'Q_BEG' } , $match->{ 'S_BEG' } ); + $right_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MAX' }, $space->{ 'S_MAX' }, $match->{ 'Q_END' } , $match->{ 'S_END' } ); + + $score_corner = Maasha::Calc::min( $left_dist, $right_dist ); + + return int $score_corner * 1.5; } @@ -538,7 +571,7 @@ sub match_score_diag $space, # search space ) = @_; - # Returns nothing. + # Returns integer. my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag ); @@ -573,16 +606,16 @@ sub match_score_diag $end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' }, $match->{ 'S_BEG' }, $space->{ 'Q_MAX' } - $q_dim, - $space->{ 's_max' } - $q_dim, + $space->{ 'S_MAX' } - $q_dim, $space->{ 'Q_MAX' }, $space->{ 'S_MAX' } ); } $min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist ); - $score_diag = 2 * $min_diag_dist ** 2; + $score_diag = $min_diag_dist; - $match->{ 'SCORE' } -= $score_diag; + return int $score_diag; } @@ -596,7 +629,7 @@ sub match_score_narrow $space, # search space ) = @_; - # Returns nothing. + # Returns integer. my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow ); @@ -622,92 +655,124 @@ sub match_score_narrow $score_narrow = $max_narrow_dist; - $match->{ 'SCORE' } -= $score_narrow; + return abs( $beg_narrow_dist - $end_narrow_dist ); + + return $score_narrow; } -__END__ +sub shift_case +{ + # Martin A. Hansen, August 2009. + + # Given a list of matches and sequence references + # uppercase only matching sequnce while lowercasing the rest. + + my ( $matches, # list of matches + $q_seq, # query sequence ref + $s_seq, # subject sequence ref + ) = @_; + + # Returns nothing. + + my ( $match ); + + ${ $q_seq } = lc ${ $q_seq }; + ${ $s_seq } = lc ${ $s_seq }; + + foreach $match ( @{ $matches } ) + { + substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }, uc substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }; + substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }, uc substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }; + } +} + sub insert_indels { - # Martin A. Hansen, June 2009. + # Martin A. Hansen, August 2009. - # Given a list of matches and references to q_seq and s_seq, + # Given a list of matches and sequence references # insert indels to form aligned sequences. my ( $matches, # list of matches - $q_seq, # ref to query sequence - $s_seq, # ref to subject sequence + $q_seq, # query sequence ref + $s_seq, # subject sequence ref ) = @_; - # Returns nothing. - - my ( $q_indels, $s_indels, $match, $diff, $first ); + my ( $i, $q_dist, $s_dist, $q_indels, $s_indels, $diff ); - @{ $matches } = sort { $a->{ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches }; - - $first = 1; $q_indels = 0; $s_indels = 0; - # ---- initial indels ---- - - $match = shift @{ $matches }; + if ( @{ $matches } > 0 ) + { + @{ $matches } = sort { $a->{ 'Q_BEG' } <=> $b->{ 'Q_BEG' } } @{ $matches }; # FIXME is this necessary? - substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ]; - substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ]; + # >>>>>>>>>>>>>> FIRST MATCH <<<<<<<<<<<<<< - $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels ); + $diff = $matches->[ 0 ]->{ 'Q_BEG' } - $matches->[ 0 ]->{ 'S_BEG' }; - if ( $diff < 0 ) + if ( $diff > 0 ) { - substr ${ $q_seq }, 0, 0, '-' x abs $diff; + substr ${ $s_seq }, 0, 0, '-' x $diff; - $q_indels += abs $diff; + $s_indels += $diff; } - elsif ( $diff > 0 ) + elsif ( $diff < 0 ) { - substr ${ $s_seq }, 0, 0, '-' x abs $diff; + substr ${ $q_seq }, 0, 0, '-' x abs $diff; - $s_indels += abs $diff; + $q_indels += abs $diff; } - # ---- internal indels ---- + # >>>>>>>>>>>>>> MIDDLE MATCHES <<<<<<<<<<<<<< - foreach $match ( @{ $matches } ) + for ( $i = 0; $i < scalar @{ $matches } - 1; $i++ ) { - substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ]; - substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ]; + $q_dist = $matches->[ $i + 1 ]->{ 'Q_BEG' } - $matches->[ $i ]->{ 'Q_END' }; + $s_dist = $matches->[ $i + 1 ]->{ 'S_BEG' } - $matches->[ $i ]->{ 'S_END' }; - $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels ); + $diff = $q_dist - $s_dist; - if ( $diff < 0 ) + if ( $diff > 0 ) { - substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff; + substr ${ $s_seq }, $s_indels + $matches->[ $i ]->{ 'S_END' } + 1, 0, '-' x $diff; - $q_indels += abs $diff; + $s_indels += $diff; } - elsif ( $diff > 0 ) + elsif ( $diff < 0 ) { - substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff; + substr ${ $q_seq }, $q_indels + $matches->[ $i ]->{ 'Q_END' } + 1, 0, '-' x abs $diff; - $s_indels += abs $diff; + $q_indels += abs $diff; } } + } - # ---- last indels ---- + # >>>>>>>>>>>>>> LAST MATCH <<<<<<<<<<<<<< - $diff = length( ${ $s_seq } ) - length( ${ $q_seq } ); + $diff = length( ${ $q_seq } ) - length( ${ $s_seq } ); - if ( $diff > 0 ) { - ${ $q_seq } .= '-' x abs $diff; - } else { - ${ $s_seq } .= '-' x abs $diff; + if ( $diff > 0 ) + { + ${ $s_seq } .= '-' x $diff; + + $s_indels += $diff; + } + else + { + ${ $q_seq } .= '-' x abs $diff; + + $q_indels += abs $diff; } } +__END__ + + sub align_sim_local { # Martin A. Hansen, May 2007. @@ -788,51 +853,4 @@ sub align_sim_global } -sub align_consensus -{ - # Martin A. Hansen, June 2006. - - # Given an alignment as a list of FASTA entries, - # generates a consensus sequences based on the - # entropies for each column similar to the way - # a sequence logo i calculated. Returns the - # consensus sequence as a FASTA entry. - - my ( $entries, # list of aligned FASTA entries - $type, # residue type - OPTIONAL - $min_sim, # minimum similarity - OPTIONAL - ) = @_; - - # Returns tuple - - my ( $bit_max, $data, $pos, $char, $score, $entry ); - - $type ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] ); - $min_sim ||= 50; - - if ( $type =~ /protein/ ) { - $bit_max = 4; - } else { - $bit_max = 2; - } - - $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries ); - - foreach $pos ( @{ $data } ) - { - ( $char, $score ) = @{ $pos->[ -1 ] }; - - if ( ( $score / $bit_max ) * 100 >= $min_sim ) { - $entry->[ SEQ ] .= $char; - } else { - $entry->[ SEQ ] .= "-"; - } - } - - $entry->[ HEAD ] = "Consensus: $min_sim%"; - - return wantarray ? @{ $entry } : $entry; -} - - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/test/test_AlignTwoSeq.pl b/code_perl/Maasha/test/test_AlignTwoSeq.pl index 0211928..cd59552 100755 --- a/code_perl/Maasha/test/test_AlignTwoSeq.pl +++ b/code_perl/Maasha/test/test_AlignTwoSeq.pl @@ -36,6 +36,8 @@ test_match_score(); test_align_two_seq(); +test_insert_indels(); + sub test_new_space { @@ -509,9 +511,131 @@ sub test_align_two_seq $matches = Maasha::AlignTwoSeq::align_two_seq( $space, [] ); - print Dumper( $matches ); +# print Dumper( $matches ); ok( 0 ); } +sub test_insert_indels +{ + my ( $matches, $q_seq, $s_seq ); + + $matches = [ + { Q_BEG => 1, S_BEG => 1, Q_END => 4, S_END => 4 } + ]; + + $q_seq = "XATCG"; + $s_seq = "PATCG"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "XATCG" ); + is( $s_seq, "PATCG" ); + + $matches = [ + { Q_BEG => 0, S_BEG => 1, Q_END => 3, S_END => 4 } + ]; + + $q_seq = "ATCG"; + $s_seq = "PATCG"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "-ATCG" ); + is( $s_seq, "PATCG" ); + + $matches = [ + { Q_BEG => 1, S_BEG => 0, Q_END => 4, S_END => 3 } + ]; + + $q_seq = "XATCG"; + $s_seq = "ATCG"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "XATCG" ); + is( $s_seq, "-ATCG" ); + + $matches = [ + { Q_BEG => 0, S_BEG => 0, Q_END => 3, S_END => 3 }, + { Q_BEG => 6, S_BEG => 6, Q_END => 9, S_END => 9 }, + ]; + + $q_seq = "ATCGXXATCG"; + $s_seq = "ATCGNNATCG"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "ATCGXXATCG" ); + is( $s_seq, "ATCGNNATCG" ); + + $matches = [ + { Q_BEG => 0, S_BEG => 0, Q_END => 3, S_END => 3 }, + { Q_BEG => 6, S_BEG => 4, Q_END => 9, S_END => 7 }, + ]; + + $q_seq = "ATCGXXATCG"; + $s_seq = "ATCGATCG"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "ATCGXXATCG" ); + is( $s_seq, "ATCG--ATCG" ); + + $matches = [ + { Q_BEG => 0, S_BEG => 1, Q_END => 2, S_END => 3 }, + { Q_BEG => 5, S_BEG => 4, Q_END => 8, S_END => 7 }, + ]; + + $q_seq = "TCGXXATCG"; + $s_seq = "ATCGATCG"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "-TCGXXATCG" ); + is( $s_seq, "ATCG--ATCG" ); + + $matches = [ + { Q_BEG => 1, S_BEG => 0, Q_END => 3, S_END => 2 }, + { Q_BEG => 6, S_BEG => 3, Q_END => 9, S_END => 6 }, + ]; + + $q_seq = "ATCGXXATCG"; + $s_seq = "TCGATCG"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "ATCGXXATCG" ); + is( $s_seq, "-TCG--ATCG" ); + + $matches = [ + { Q_BEG => 1, Q_END => 3, S_BEG => 0, S_END => 2 }, + { Q_BEG => 6, Q_END => 7, S_BEG => 3, S_END => 4 }, + { Q_BEG => 9, Q_END => 10, S_BEG => 7, S_END => 8 }, + ]; + + $q_seq = "ATCGXXATACG"; + $s_seq = "TCGATNTCG"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "ATCGXXAT-ACG" ); + # ||| || || + is( $s_seq, "-TCG--ATNTCG" ); + + $matches = [ + { Q_BEG => 1, Q_END => 3, S_BEG => 0, S_END => 2 }, + { Q_BEG => 6, Q_END => 7, S_BEG => 3, S_END => 4 }, + { Q_BEG => 9, Q_END => 10, S_BEG => 7, S_END => 8 }, + ]; + + $q_seq = "ATCGXXATACG"; + $s_seq = "TCGATNTCGXX"; + + Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq ); + + is( $q_seq, "ATCGXXAT-ACG--" ); + # ||| || || + is( $s_seq, "-TCG--ATNTCGXX" ); +} -- 2.39.5