From: martinahansen Date: Thu, 28 Jun 2012 11:29:22 +0000 (+0000) Subject: cleanup of pair stuff in align.rb X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=6f8f52dbf050dab0a0a4666a9e41a82d4e30c63b;p=biopieces.git cleanup of pair stuff in align.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1849 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb index bb83d9e..a828721 100755 --- a/code_ruby/lib/maasha/align.rb +++ b/code_ruby/lib/maasha/align.rb @@ -143,7 +143,7 @@ class Align ap = AlignPair.new(q_entry.seq, s_entry.seq) - ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 256) + ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 15, []) ap.upcase_match ap.insert_gaps @@ -248,19 +248,26 @@ class Align # within the specified search space. Finally, the longest match is used # to define a left and a right search space which are recursed into # until no more matches are found. - def align(q_space_beg, q_space_end, s_space_beg, s_space_end, length) + def align(q_space_beg, q_space_end, s_space_beg, s_space_end, kmer_size, matches) # puts "search space: #{q_space_beg}-#{q_space_end} x #{s_space_beg}-#{s_space_end}" new_matches = [] - while new_matches.empty? and length > 0 - if @q_seq.length >= length and @s_seq.length >= length - q_index = index_seq(q_space_beg, q_space_end, @q_seq, length, length) - s_index = index_seq(s_space_beg, s_space_end, @s_seq, length, 1) - new_matches = find_matches(q_index, s_index, length) + matches.each do |match| + new_matches << match if match.q_beg >= q_space_beg and + match.q_end <= q_space_end and + match.s_beg >= s_space_beg and + match.s_end <= s_space_end + end + + while new_matches.empty? and kmer_size > 0 + if @q_seq.length >= kmer_size and @s_seq.length >= kmer_size + q_index = index_seq(q_space_beg, q_space_end, @q_seq, kmer_size, kmer_size) + s_index = index_seq(s_space_beg, s_space_end, @s_seq, kmer_size, 1) + new_matches = find_matches(q_index, s_index, kmer_size) end unless new_matches.empty? - longest_match = new_matches.sort_by { |match| match.length }.last + longest_match = new_matches.sort_by { |match| match.length }.pop longest_match.expand(@q_seq, @s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end) # pp longest_match @@ -277,15 +284,15 @@ class Align s_space_end_right = (s_space_end == @s_seq.length) ? @s_seq.length : s_space_end - 1 if q_space_end_left - q_space_beg_left > 0 and s_space_end_left - s_space_beg_left > 0 - align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, length) + align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, kmer_size, new_matches) end if q_space_end_right - q_space_beg_right > 0 and s_space_end_right - s_space_beg_right > 0 - align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, length) + align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, kmer_size, new_matches) end end - length /= 2 + kmer_size /= 2 end end @@ -351,17 +358,18 @@ class Align # and step size. All oligo positions are saved in a lookup hash # where the oligo is the key and the position is the value. Non- # unique oligos are removed and the index returned. - def index_seq(space_beg, space_end, seq, oligo_size, step_size) + def index_seq(space_beg, space_end, seq, kmer_size, step_size) index = Hash.new index_count = Hash.new(0) - (space_beg ... space_end).step(step_size).each do |pos| - oligo = seq[pos ... pos + oligo_size] + i = space_beg - if oligo.length == oligo_size - index[oligo] = pos - index_count[oligo] += 1 - end + while i < space_end - kmer_size + 1 + oligo = seq[i ... i + kmer_size] + + index[oligo] = i + index_count[oligo] += 1 + i += step_size end index_count.each do |oligo, count| @@ -375,21 +383,20 @@ class Align # using non-overlapping unique oligos and their positions and one is # using overlapping unique oligos and positions. Thus iterating over the # non-overlapping oligos and checking last match, this can be extended. - def find_matches(q_index, s_index, length) + def find_matches(q_index, s_index, kmer_size) matches = [] q_index.each do |oligo, q_pos| - if s_index[oligo] - s_pos = s_index[oligo] + if s_pos = s_index[oligo] last_match = matches.last - new_match = Match.new(q_pos, s_pos, length) + new_match = Match.new(q_pos, s_pos, kmer_size) if last_match and last_match.q_beg + last_match.length == new_match.q_beg and last_match.s_beg + last_match.length == new_match.s_beg - last_match.length += length + last_match.length += kmer_size else matches << new_match end