X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Falign%2Fpair.rb;h=8bcbf25bddad3a8143e44d43a6c3a20c1e6790af;hb=3ce24a349c10a76d3e837e08b81259204ab870fe;hp=45e9298c495257c6f519321d00298abec0836262;hpb=1d54595c1543846e648795c6fbea809e6806057c;p=biopieces.git diff --git a/code_ruby/lib/maasha/align/pair.rb b/code_ruby/lib/maasha/align/pair.rb index 45e9298..8bcbf25 100644 --- a/code_ruby/lib/maasha/align/pair.rb +++ b/code_ruby/lib/maasha/align/pair.rb @@ -22,22 +22,12 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Extending Math module with a couple of useful methods. -module Math - # Method for calculating the distance between a point and a line. - def self.dist_point2line(px, py, x1, y1, x2, y2) - a = (y2.to_f - y1) / (x2.to_f - x1) +require 'maasha/align/matches' +require 'maasha/math_aux' - b = y1 - a * x1 - - (a * px + b - py ).abs / Math.sqrt(a ** 2 + 1 ) - end - - # Method for calculating the distance between two points. - def self.dist_point2point(x1, y1, x2, y2) - Math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) - end -end +FACTOR_SCORE_LENGTH = 1.0 +FACTOR_SCORE_DIAG = -1.41 +KMER = 32 # Module with stuff to create a pairwise aligment. module PairAlign @@ -57,7 +47,9 @@ module PairAlign @q_entry.seq.downcase! @s_entry.seq.downcase! - align_recurse(@q_entry.seq, @s_entry.seq, 0, 0, @q_entry.length - 1, @s_entry.length - 1, 16, []) + space = Space.new(0, 0, @q_entry.length - 1, @s_entry.length - 1) + + align_recurse(@q_entry.seq, @s_entry.seq, space, KMER) matches_upcase gaps_insert end @@ -71,189 +63,102 @@ module PairAlign # depending on a calculated score. New search spaces spanning the spaces # between the best scoring matches and the search space boundaries will be # cast and recursed into. - def align_recurse(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer, matches) - matches = matches_select_by_space(matches, q_min, s_min, q_max, s_max) + def align_recurse(q_seq, s_seq, space, kmer, matches = []) + matches = matches_select_by_space(matches, space) + matches = matches_select_by_score(matches, space) while (matches.size == 0 and kmer > 0) - matches = matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) + matches = Matches.find(q_seq, s_seq, space.q_min, space.s_min, space.q_max, space.s_max, kmer) + #matches = Mem.find(q_seq, s_seq, kmer, space.q_min, space.s_min, space.q_max, space.s_max) + + if @matches.empty? + matches.sort_by! { |m| m.length } + else + matches = matches_select_by_score(matches, space) + end + kmer /= 2 end - matches = matches_select_by_score(matches, q_min, s_min, q_max, s_max) - if best_match = matches.pop @matches << best_match - l_q_min = q_min - l_s_min = s_min - l_q_max = best_match.q_beg - 1 - l_s_max = best_match.s_beg - 1 - - r_q_min = best_match.q_end + 1 - r_s_min = best_match.s_end + 1 - r_q_max = q_max - r_s_max = s_max - - if l_q_max - l_q_min > 0 and l_s_max - l_s_min > 0 - align_recurse(q_seq, s_seq, l_q_min, l_s_min, l_q_max, l_s_max, kmer, matches) - end + space_left = Space.new(space.q_min, space.s_min, best_match.q_beg - 1, best_match.s_beg - 1) + space_right = Space.new(best_match.q_end + 1, best_match.s_end + 1, space.q_max, space.s_max) - if r_q_max - r_q_min > 0 and r_s_max - r_s_min > 0 - align_recurse(q_seq, s_seq, r_q_min, r_s_min, r_q_max, r_s_max, kmer, matches) - end + align_recurse(q_seq, s_seq, space_left, kmer, matches) unless space_left.empty? + align_recurse(q_seq, s_seq, space_right, kmer, matches) unless space_right.empty? end end # Method to select matches that lies within the search space. - def matches_select_by_space(matches, q_min, s_min, q_max, s_max) + def matches_select_by_space(matches, space) new_matches = matches.select do |match| - match.q_beg >= q_min and - match.s_beg >= s_min and - match.q_end <= q_max and - match.s_end <= s_max + match.q_beg >= space.q_min and + match.s_beg >= space.s_min and + match.q_end <= space.q_max and + match.s_end <= space.s_max end new_matches end - # Method to select the best scoring matches and return these sorted - # according to score. - def matches_select_by_score(matches, q_min, s_min, q_max, s_max) - new_matches = [] - - matches.each do |match| - score_length = match_score_by_length(match) - score_diag = match_score_by_diagonal_dist(match, q_min, s_min, q_max, s_max) - - match.score = score_length - score_diag - - new_matches << match if match.score > 0 - end - - new_matches.sort_by! { |match| match.score } - - new_matches - end - - # Method to calculate score based on match length. - def match_score_by_length(match) - match.length.to_f - end - - # Method to calculate score based on the distance to the closest - # diagonal. The smaller the distance the better the score. - def match_score_by_diagonal_dist(match, q_min, s_min, q_max, s_max) - q_dim = q_max - q_min + 1 - s_dim = s_max - s_min + 1 - - if q_dim >= s_dim # s_dim is the narrow end - beg_dist = Math.dist_point2line(match.q_beg, match.s_beg, q_min, s_min, q_min + s_dim, s_min + s_dim) - end_dist = Math.dist_point2line(match.q_beg, match.s_beg, q_max - s_dim, s_max - s_dim, q_max, s_max) - else - beg_dist = Math.dist_point2line(match.q_beg, match.s_beg, q_min, s_min, q_min + q_dim, s_min + q_dim) - end_dist = Math.dist_point2line(match.q_beg, match.s_beg, q_max - q_dim, s_max - q_dim, q_max, s_max) - end - - min_dist = (beg_dist < end_dist) ? beg_dist : end_dist - min_dist.to_f - end - - # Method that finds all maximally expanded non-redundant matches shared - # between two sequences inside a given search space. - def matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) - matches = [] - redundant = Hash.new { |h, k| h[k] = [] } - - s_index = index_seq(s_seq, s_min, s_max, kmer) - - q_pos = q_min - - while q_pos <= q_max - kmer + 1 - q_oligo = q_seq[q_pos ... q_pos + kmer] - - s_index[q_oligo].each do |s_pos| - match = Match.new(q_pos, s_pos, kmer) - - unless match_redundant?(redundant, match) - match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - matches << match - - match_redundant_add(redundant, match) - end - end - - q_pos += 1 - end + # Method to select matches based on score. + def matches_select_by_score(matches, space) + matches_score(matches, space) - matches + matches.select { |match| match.score > 0 } end - # Method that indexes a seuquence within a given interval such that the - # index contains all oligos of a given kmer size and the positions where - # this oligo was located. - def index_seq(seq, min, max, kmer) - index_hash = Hash.new { |h, k| h[k] = [] } - - pos = min - - while pos <= max - kmer + 1 - oligo = seq[pos ... pos + kmer] - index_hash[oligo] << pos - - pos += 1 - end - - index_hash - end + def matches_score(matches, space) + matches.each do |match| + score_length = match_score_length(match) + score_diag = match_score_diag(match, space) - # Method to check if a match is redundant. - def match_redundant?(redundant, match) - redundant[match.q_beg].each do |s_interval| - if s_interval.include? match.s_beg and s_interval.include? match.s_end - return true - end + match.score = score_length + score_diag end - false + matches.sort_by! { |match| match.score } end - # Method that adds a match to the redundancy index. - def match_redundant_add(redundant, match) - (match.q_beg .. match.q_end).each do |q| - redundant[q] << (match.s_beg .. match.s_end) - end + def match_score_length(match) + match.length * FACTOR_SCORE_LENGTH end - # Method that expands a match as far as possible to the left and right. - def match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - match_expand_left(match, q_seq, s_seq, q_min, s_min) - match_expand_right(match, q_seq, s_seq, q_max, s_max) - - match - end + def match_score_diag(match, space) + if space.q_dim > space.s_dim # s_dim is the narrow end + dist_beg = Math.dist_point2line(match.q_beg, + match.s_beg, + space.q_min, + space.s_min, + space.q_min + space.s_dim, + space.s_min + space.s_dim) - # Method that expands a match as far as possible to the left. - def match_expand_left(match, q_seq, s_seq, q_min, s_min) - while match.q_beg > q_min and - match.s_beg > s_min and - q_seq[match.q_beg - 1] == s_seq[match.s_beg - 1] - match.q_beg -= 1 - match.s_beg -= 1 - match.length += 1 + dist_end = Math.dist_point2line( match.q_beg, + match.s_beg, + space.q_max - space.s_dim, + space.s_max - space.s_dim, + space.q_max, + space.s_max) + else + dist_beg = Math.dist_point2line( match.q_beg, + match.s_beg, + space.q_min, + space.s_min, + space.q_min + space.q_dim, + space.s_min + space.q_dim) + + dist_end = Math.dist_point2line( match.q_beg, + match.s_beg, + space.q_max - space.q_dim, + space.s_max - space.q_dim, + space.q_max, + space.s_max) end - match - end + dist_min = dist_beg < dist_end ? dist_beg : dist_end - # Method that expands a match as far as possible to the right. - def match_expand_right(match, q_seq, s_seq, q_max, s_max) - while match.q_end < q_max and - match.s_end < s_max and - q_seq[match.q_end + 1] == s_seq[match.s_end + 1] - match.length += 1 - end - - match + dist_min * FACTOR_SCORE_DIAG end # Method for debugging purposes that upcase matching sequence while non-matches @@ -267,23 +172,31 @@ module PairAlign # Method that insert gaps in sequences based on a list of matches and thus # creating an alignment. - # TODO check boundaries! def gaps_insert @matches.sort_by! { |m| m.q_beg } q_gaps = 0 s_gaps = 0 - @matches.each do |match| - diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg) + match = @matches.first + diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg) - #pp "q_gaps #{q_gaps} s_gaps #{s_gaps} diff #{diff}" + if diff < 0 + @q_entry.seq.insert(0, "-" * diff.abs) + q_gaps += diff.abs + elsif diff > 0 + @s_entry.seq.insert(0, "-" * diff.abs) + s_gaps += diff.abs + end + + @matches[1 .. -1].each do |m| + diff = (q_gaps + m.q_beg) - (s_gaps + m.s_beg) if diff < 0 - @q_entry.seq.insert(match.q_beg + q_gaps, "-" * diff.abs) + @q_entry.seq.insert(m.q_beg + q_gaps, "-" * diff.abs) q_gaps += diff.abs elsif diff > 0 - @s_entry.seq.insert(match.s_beg + s_gaps, "-" * diff.abs) + @s_entry.seq.insert(m.s_beg + s_gaps, "-" * diff.abs) s_gaps += diff.abs end end @@ -298,29 +211,31 @@ module PairAlign end end - # Class for containing a match between two sequences q and s. - class Match - attr_accessor :q_beg, :s_beg, :length, :score + # Class for containing a search space between two sequences q and s. + class Space + attr_reader :q_min, :s_min, :q_max, :s_max - def initialize(q_beg, s_beg, length, score = 0.0) - @q_beg = q_beg - @s_beg = s_beg - @length = length - @score = score + def initialize(q_min, s_min, q_max, s_max) + @q_min = q_min + @s_min = s_min + @q_max = q_max + @s_max = s_max end - def q_end - @q_beg + @length - 1 + def q_dim + @q_max - @q_min + 1 end - def s_end - @s_beg + @length - 1 + def s_dim + @s_max - @s_min + 1 end - def to_s(seq = nil) - s = "q: #{@q_beg} #{q_end} s: #{@s_beg} #{s_end} l: #{@length} s: #{@score}" - s << " seq: #{seq[@q_beg .. q_end]}" if seq - s + def empty? + if @q_max - @q_min >= 0 and @s_max - @s_min >= 0 + return false + end + + true end end end