while ( $match->{ 'Q_END' } <= $space->{ 'Q_MAX' } and
$match->{ 'S_END' } <= $space->{ 'S_MAX' } and
- lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
+ uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_END' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_END' }, 1 ) )
{
$match->{ 'Q_END' }++;
$match->{ 'S_END' }++;
while ( $match->{ 'Q_BEG' } >= $space->{ 'Q_MIN' } and
$match->{ 'S_BEG' } >= $space->{ 'S_MIN' } and
- lc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq lc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
+ uc substr( ${ $space->{ 'Q_SEQ' } }, $match->{ 'Q_BEG' }, 1 ) eq uc substr( ${ $space->{ 'S_SEQ' } }, $match->{ 'S_BEG' }, 1 ) )
{
$match->{ 'Q_BEG'}--;
$match->{ 'S_BEG'}--;
# Returns nothing.
- match_score_len( $match );
- match_score_diag( $match, $space );
- match_score_narrow( $match, $space );
+ my ( $score_len, $score_corner, $score_diag, $score_narrow, $score_total );
+
+ $score_len = match_score_len( $match );
+ $score_corner = match_score_corner( $match, $space );
+ $score_diag = match_score_diag( $match, $space );
+ $score_narrow = 0; #match_score_narrow( $match, $space );
+
+ $score_total = $score_len - $score_corner - $score_diag - $score_narrow;
+
+ # print STDERR "LEN: $score_len CORNER: $score_corner DIAG: $score_diag NARROW: $score_narrow: TOTAL: $score_total\n" if $score_total > 0;
+ print STDERR "LEN: $score_len CORNER: $score_corner DIAG: $score_diag NARROW: $score_narrow: TOTAL: $score_total\n";
+
+ $match->{ 'SCORE' } = $score_total;
}
my ( $match, # match
) = @_;
- # Returns nothing.
+ # Returns integer.
- $match->{ 'SCORE' } = $match->{ 'LEN' } ** 3;
+ return $match->{ 'LEN' } * 2;
+}
+
+
+sub match_score_corner
+{
+ # Martin A. Hansen, August 2009
+
+ # Score according to distance from corner.
+
+ my ( $match, # match
+ $space, # search space
+ ) = @_;
+
+ # Returns integer.
+
+ my ( $left_dist, $right_dist, $score_corner );
+
+ $left_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MIN' }, $space->{ 'S_MIN' }, $match->{ 'Q_BEG' } , $match->{ 'S_BEG' } );
+ $right_dist = Maasha::Calc::dist_point2point( $space->{ 'Q_MAX' }, $space->{ 'S_MAX' }, $match->{ 'Q_END' } , $match->{ 'S_END' } );
+
+ $score_corner = Maasha::Calc::min( $left_dist, $right_dist );
+
+ return int $score_corner * 1.5;
}
$space, # search space
) = @_;
- # Returns nothing.
+ # Returns integer.
my ( $q_dim, $s_dim, $beg_diag_dist, $end_diag_dist, $min_diag_dist, $score_diag );
$end_diag_dist = Maasha::Calc::dist_point2line( $match->{ 'Q_BEG' },
$match->{ 'S_BEG' },
$space->{ 'Q_MAX' } - $q_dim,
- $space->{ 's_max' } - $q_dim,
+ $space->{ 'S_MAX' } - $q_dim,
$space->{ 'Q_MAX' },
$space->{ 'S_MAX' } );
}
$min_diag_dist = Maasha::Calc::min( $beg_diag_dist, $end_diag_dist );
- $score_diag = 2 * $min_diag_dist ** 2;
+ $score_diag = $min_diag_dist;
- $match->{ 'SCORE' } -= $score_diag;
+ return int $score_diag;
}
$space, # search space
) = @_;
- # Returns nothing.
+ # Returns integer.
my ( $max_narrow_dist, $q_dim, $s_dim, $beg_narrow_dist, $end_narrow_dist, $score_narrow );
$score_narrow = $max_narrow_dist;
- $match->{ 'SCORE' } -= $score_narrow;
+ return abs( $beg_narrow_dist - $end_narrow_dist );
+
+ return $score_narrow;
}
-__END__
+sub shift_case
+{
+ # Martin A. Hansen, August 2009.
+
+ # Given a list of matches and sequence references
+ # uppercase only matching sequnce while lowercasing the rest.
+
+ my ( $matches, # list of matches
+ $q_seq, # query sequence ref
+ $s_seq, # subject sequence ref
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $match );
+
+ ${ $q_seq } = lc ${ $q_seq };
+ ${ $s_seq } = lc ${ $s_seq };
+
+ foreach $match ( @{ $matches } )
+ {
+ substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' }, uc substr ${ $q_seq }, $match->{ 'Q_BEG' }, $match->{ 'LEN' };
+ substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' }, uc substr ${ $s_seq }, $match->{ 'S_BEG' }, $match->{ 'LEN' };
+ }
+}
+
sub insert_indels
{
- # Martin A. Hansen, June 2009.
+ # Martin A. Hansen, August 2009.
- # Given a list of matches and references to q_seq and s_seq,
+ # Given a list of matches and sequence references
# insert indels to form aligned sequences.
my ( $matches, # list of matches
- $q_seq, # ref to query sequence
- $s_seq, # ref to subject sequence
+ $q_seq, # query sequence ref
+ $s_seq, # subject sequence ref
) = @_;
- # Returns nothing.
-
- my ( $q_indels, $s_indels, $match, $diff, $first );
+ my ( $i, $q_dist, $s_dist, $q_indels, $s_indels, $diff );
- @{ $matches } = sort { $a->{ Q_BEG ] <=> $b->[ Q_BEG ] } @{ $matches };
-
- $first = 1;
$q_indels = 0;
$s_indels = 0;
- # ---- initial indels ----
-
- $match = shift @{ $matches };
+ if ( @{ $matches } > 0 )
+ {
+ @{ $matches } = sort { $a->{ 'Q_BEG' } <=> $b->{ 'Q_BEG' } } @{ $matches }; # FIXME is this necessary?
- substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
- substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
+ # >>>>>>>>>>>>>> FIRST MATCH <<<<<<<<<<<<<<
- $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
+ $diff = $matches->[ 0 ]->{ 'Q_BEG' } - $matches->[ 0 ]->{ 'S_BEG' };
- if ( $diff < 0 )
+ if ( $diff > 0 )
{
- substr ${ $q_seq }, 0, 0, '-' x abs $diff;
+ substr ${ $s_seq }, 0, 0, '-' x $diff;
- $q_indels += abs $diff;
+ $s_indels += $diff;
}
- elsif ( $diff > 0 )
+ elsif ( $diff < 0 )
{
- substr ${ $s_seq }, 0, 0, '-' x abs $diff;
+ substr ${ $q_seq }, 0, 0, '-' x abs $diff;
- $s_indels += abs $diff;
+ $q_indels += abs $diff;
}
- # ---- internal indels ----
+ # >>>>>>>>>>>>>> MIDDLE MATCHES <<<<<<<<<<<<<<
- foreach $match ( @{ $matches } )
+ for ( $i = 0; $i < scalar @{ $matches } - 1; $i++ )
{
- substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ], uc substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, $match->[ LEN ];
- substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ], uc substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, $match->[ LEN ];
+ $q_dist = $matches->[ $i + 1 ]->{ 'Q_BEG' } - $matches->[ $i ]->{ 'Q_END' };
+ $s_dist = $matches->[ $i + 1 ]->{ 'S_BEG' } - $matches->[ $i ]->{ 'S_END' };
- $diff = ( $match->[ Q_BEG ] + $q_indels ) - ( $match->[ S_BEG ] + $s_indels );
+ $diff = $q_dist - $s_dist;
- if ( $diff < 0 )
+ if ( $diff > 0 )
{
- substr ${ $q_seq }, $match->[ Q_BEG ] + $q_indels, 0, '-' x abs $diff;
+ substr ${ $s_seq }, $s_indels + $matches->[ $i ]->{ 'S_END' } + 1, 0, '-' x $diff;
- $q_indels += abs $diff;
+ $s_indels += $diff;
}
- elsif ( $diff > 0 )
+ elsif ( $diff < 0 )
{
- substr ${ $s_seq }, $match->[ S_BEG ] + $s_indels, 0, '-' x abs $diff;
+ substr ${ $q_seq }, $q_indels + $matches->[ $i ]->{ 'Q_END' } + 1, 0, '-' x abs $diff;
- $s_indels += abs $diff;
+ $q_indels += abs $diff;
}
}
+ }
- # ---- last indels ----
+ # >>>>>>>>>>>>>> LAST MATCH <<<<<<<<<<<<<<
- $diff = length( ${ $s_seq } ) - length( ${ $q_seq } );
+ $diff = length( ${ $q_seq } ) - length( ${ $s_seq } );
- if ( $diff > 0 ) {
- ${ $q_seq } .= '-' x abs $diff;
- } else {
- ${ $s_seq } .= '-' x abs $diff;
+ if ( $diff > 0 )
+ {
+ ${ $s_seq } .= '-' x $diff;
+
+ $s_indels += $diff;
+ }
+ else
+ {
+ ${ $q_seq } .= '-' x abs $diff;
+
+ $q_indels += abs $diff;
}
}
+__END__
+
+
sub align_sim_local
{
# Martin A. Hansen, May 2007.
}
-sub align_consensus
-{
- # Martin A. Hansen, June 2006.
-
- # Given an alignment as a list of FASTA entries,
- # generates a consensus sequences based on the
- # entropies for each column similar to the way
- # a sequence logo i calculated. Returns the
- # consensus sequence as a FASTA entry.
-
- my ( $entries, # list of aligned FASTA entries
- $type, # residue type - OPTIONAL
- $min_sim, # minimum similarity - OPTIONAL
- ) = @_;
-
- # Returns tuple
-
- my ( $bit_max, $data, $pos, $char, $score, $entry );
-
- $type ||= Maasha::Seq::seq_guess_type( $entries->[ 0 ] );
- $min_sim ||= 50;
-
- if ( $type =~ /protein/ ) {
- $bit_max = 4;
- } else {
- $bit_max = 2;
- }
-
- $data = Maasha::Seq::seqlogo_calc( $bit_max, $entries );
-
- foreach $pos ( @{ $data } )
- {
- ( $char, $score ) = @{ $pos->[ -1 ] };
-
- if ( ( $score / $bit_max ) * 100 >= $min_sim ) {
- $entry->[ SEQ ] .= $char;
- } else {
- $entry->[ SEQ ] .= "-";
- }
- }
-
- $entry->[ HEAD ] = "Consensus: $min_sim%";
-
- return wantarray ? @{ $entry } : $entry;
-}
-
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
test_align_two_seq();
+test_insert_indels();
+
sub test_new_space
{
$matches = Maasha::AlignTwoSeq::align_two_seq( $space, [] );
- print Dumper( $matches );
+# print Dumper( $matches );
ok( 0 );
}
+sub test_insert_indels
+{
+ my ( $matches, $q_seq, $s_seq );
+
+ $matches = [
+ { Q_BEG => 1, S_BEG => 1, Q_END => 4, S_END => 4 }
+ ];
+
+ $q_seq = "XATCG";
+ $s_seq = "PATCG";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "XATCG" );
+ is( $s_seq, "PATCG" );
+
+ $matches = [
+ { Q_BEG => 0, S_BEG => 1, Q_END => 3, S_END => 4 }
+ ];
+
+ $q_seq = "ATCG";
+ $s_seq = "PATCG";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "-ATCG" );
+ is( $s_seq, "PATCG" );
+
+ $matches = [
+ { Q_BEG => 1, S_BEG => 0, Q_END => 4, S_END => 3 }
+ ];
+
+ $q_seq = "XATCG";
+ $s_seq = "ATCG";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "XATCG" );
+ is( $s_seq, "-ATCG" );
+
+ $matches = [
+ { Q_BEG => 0, S_BEG => 0, Q_END => 3, S_END => 3 },
+ { Q_BEG => 6, S_BEG => 6, Q_END => 9, S_END => 9 },
+ ];
+
+ $q_seq = "ATCGXXATCG";
+ $s_seq = "ATCGNNATCG";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "ATCGXXATCG" );
+ is( $s_seq, "ATCGNNATCG" );
+
+ $matches = [
+ { Q_BEG => 0, S_BEG => 0, Q_END => 3, S_END => 3 },
+ { Q_BEG => 6, S_BEG => 4, Q_END => 9, S_END => 7 },
+ ];
+
+ $q_seq = "ATCGXXATCG";
+ $s_seq = "ATCGATCG";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "ATCGXXATCG" );
+ is( $s_seq, "ATCG--ATCG" );
+
+ $matches = [
+ { Q_BEG => 0, S_BEG => 1, Q_END => 2, S_END => 3 },
+ { Q_BEG => 5, S_BEG => 4, Q_END => 8, S_END => 7 },
+ ];
+
+ $q_seq = "TCGXXATCG";
+ $s_seq = "ATCGATCG";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "-TCGXXATCG" );
+ is( $s_seq, "ATCG--ATCG" );
+
+ $matches = [
+ { Q_BEG => 1, S_BEG => 0, Q_END => 3, S_END => 2 },
+ { Q_BEG => 6, S_BEG => 3, Q_END => 9, S_END => 6 },
+ ];
+
+ $q_seq = "ATCGXXATCG";
+ $s_seq = "TCGATCG";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "ATCGXXATCG" );
+ is( $s_seq, "-TCG--ATCG" );
+
+ $matches = [
+ { Q_BEG => 1, Q_END => 3, S_BEG => 0, S_END => 2 },
+ { Q_BEG => 6, Q_END => 7, S_BEG => 3, S_END => 4 },
+ { Q_BEG => 9, Q_END => 10, S_BEG => 7, S_END => 8 },
+ ];
+
+ $q_seq = "ATCGXXATACG";
+ $s_seq = "TCGATNTCG";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "ATCGXXAT-ACG" );
+ # ||| || ||
+ is( $s_seq, "-TCG--ATNTCG" );
+
+ $matches = [
+ { Q_BEG => 1, Q_END => 3, S_BEG => 0, S_END => 2 },
+ { Q_BEG => 6, Q_END => 7, S_BEG => 3, S_END => 4 },
+ { Q_BEG => 9, Q_END => 10, S_BEG => 7, S_END => 8 },
+ ];
+
+ $q_seq = "ATCGXXATACG";
+ $s_seq = "TCGATNTCGXX";
+
+ Maasha::AlignTwoSeq::insert_indels( $matches, \$q_seq, \$s_seq );
+
+ is( $q_seq, "ATCGXXAT-ACG--" );
+ # ||| || ||
+ is( $s_seq, "-TCG--ATNTCGXX" );
+}