ap = AlignPair.new(q_entry.seq, s_entry.seq)
- ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 256)
+ ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 15, [])
ap.upcase_match
ap.insert_gaps
# within the specified search space. Finally, the longest match is used
# to define a left and a right search space which are recursed into
# until no more matches are found.
- def align(q_space_beg, q_space_end, s_space_beg, s_space_end, length)
+ def align(q_space_beg, q_space_end, s_space_beg, s_space_end, kmer_size, matches)
# puts "search space: #{q_space_beg}-#{q_space_end} x #{s_space_beg}-#{s_space_end}"
new_matches = []
- while new_matches.empty? and length > 0
- if @q_seq.length >= length and @s_seq.length >= length
- q_index = index_seq(q_space_beg, q_space_end, @q_seq, length, length)
- s_index = index_seq(s_space_beg, s_space_end, @s_seq, length, 1)
- new_matches = find_matches(q_index, s_index, length)
+ matches.each do |match|
+ new_matches << match if match.q_beg >= q_space_beg and
+ match.q_end <= q_space_end and
+ match.s_beg >= s_space_beg and
+ match.s_end <= s_space_end
+ end
+
+ while new_matches.empty? and kmer_size > 0
+ if @q_seq.length >= kmer_size and @s_seq.length >= kmer_size
+ q_index = index_seq(q_space_beg, q_space_end, @q_seq, kmer_size, kmer_size)
+ s_index = index_seq(s_space_beg, s_space_end, @s_seq, kmer_size, 1)
+ new_matches = find_matches(q_index, s_index, kmer_size)
end
unless new_matches.empty?
- longest_match = new_matches.sort_by { |match| match.length }.last
+ longest_match = new_matches.sort_by { |match| match.length }.pop
longest_match.expand(@q_seq, @s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end)
# pp longest_match
s_space_end_right = (s_space_end == @s_seq.length) ? @s_seq.length : s_space_end - 1
if q_space_end_left - q_space_beg_left > 0 and s_space_end_left - s_space_beg_left > 0
- align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, length)
+ align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, kmer_size, new_matches)
end
if q_space_end_right - q_space_beg_right > 0 and s_space_end_right - s_space_beg_right > 0
- align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, length)
+ align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, kmer_size, new_matches)
end
end
- length /= 2
+ kmer_size /= 2
end
end
# and step size. All oligo positions are saved in a lookup hash
# where the oligo is the key and the position is the value. Non-
# unique oligos are removed and the index returned.
- def index_seq(space_beg, space_end, seq, oligo_size, step_size)
+ def index_seq(space_beg, space_end, seq, kmer_size, step_size)
index = Hash.new
index_count = Hash.new(0)
- (space_beg ... space_end).step(step_size).each do |pos|
- oligo = seq[pos ... pos + oligo_size]
+ i = space_beg
- if oligo.length == oligo_size
- index[oligo] = pos
- index_count[oligo] += 1
- end
+ while i < space_end - kmer_size + 1
+ oligo = seq[i ... i + kmer_size]
+
+ index[oligo] = i
+ index_count[oligo] += 1
+ i += step_size
end
index_count.each do |oligo, count|
# using non-overlapping unique oligos and their positions and one is
# using overlapping unique oligos and positions. Thus iterating over the
# non-overlapping oligos and checking last match, this can be extended.
- def find_matches(q_index, s_index, length)
+ def find_matches(q_index, s_index, kmer_size)
matches = []
q_index.each do |oligo, q_pos|
- if s_index[oligo]
- s_pos = s_index[oligo]
+ if s_pos = s_index[oligo]
last_match = matches.last
- new_match = Match.new(q_pos, s_pos, length)
+ new_match = Match.new(q_pos, s_pos, kmer_size)
if last_match and
last_match.q_beg + last_match.length == new_match.q_beg and
last_match.s_beg + last_match.length == new_match.s_beg
- last_match.length += length
+ last_match.length += kmer_size
else
matches << new_match
end