class AlignError < StandardError; end;
-ALPH_DNA = %w{A T C G}
-ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
+ALPH_DNA = %w{A T C G}
+ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
BIT_INDEL = 0
# BIT_A = 1 << 0
self.new(result)
end
+
+ # Class method to create a pairwise alignment of two given Seq objects. The
+ # alignment is created by casting a search space the size of the sequences
+ # and save the longest perfect match between the sequences and recurse into
+ # the left and right search spaced around this match. When all search spaces
+ # are exhausted the saved matches are used to insert gaps in the sequences.
+ def self.pair(q_entry, s_entry)
+ q_space_beg = 0
+ q_space_end = q_entry.length
+ s_space_beg = 0
+ s_space_end = s_entry.length
+
+ q_entry.seq.downcase!
+ s_entry.seq.downcase!
+
+ ap = AlignPair.new(q_entry.seq, s_entry.seq)
+
+ ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 256)
+
+ ap.upcase_match
+ ap.insert_gaps
+
+ #ap.dump_bp and exit
+
+ self.new([q_entry, s_entry])
+ end
# Method to initialize an Align object with a list of aligned Seq objects.
def initialize(entries, options = {})
private
+ # Class for creating a pairwise alignment of two specified sequences. The
+ # alignment is created by casting a search space the size of the sequences
+ # and save the longest perfect match between the sequences and recurse into
+ # the left and right search spaced around this match. When all search spaces
+ # are exhausted the saved matches are used to insert gaps in the sequences.
+ class AlignPair
+ attr_reader :matches
+
+ # Method to initialize an AlignPair object given two sequence strings.
+ def initialize(q_seq, s_seq)
+ @q_seq = q_seq
+ @s_seq = s_seq
+ @matches = []
+ end
+
+ # Method that locates the longest perfect match in a specified search space
+ # by comparing two sequence indeces. The first index is created by hashing
+ # for the first sequence the position of all non-overlapping unique oligos
+ # with the oligo as key an the position as value. The second index is
+ # created for the second sequence by hashing the position of all
+ # overlapping unique oligos in a similar way. The longest match is located
+ # by comparing these indeces and the longest match is expanded maximally
+ # within the specified search space. Finally, the longest match is used
+ # to define a left and a right search space which are recursed into
+ # until no more matches are found.
+ def align(q_space_beg, q_space_end, s_space_beg, s_space_end, length)
+# puts "search space: #{q_space_beg}-#{q_space_end} x #{s_space_beg}-#{s_space_end}"
+ new_matches = []
+
+ while new_matches.empty? and length > 0
+ if @q_seq.length >= length and @s_seq.length >= length
+ q_index = index_seq(q_space_beg, q_space_end, @q_seq, length, length)
+ s_index = index_seq(s_space_beg, s_space_end, @s_seq, length, 1)
+ new_matches = find_matches(q_index, s_index, length)
+ end
+
+ unless new_matches.empty?
+ longest_match = new_matches.sort_by { |match| match.length }.last
+ longest_match.expand(@q_seq, @s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end)
+# pp longest_match
+
+ @matches << longest_match
+
+ q_space_beg_left = (q_space_beg == 0) ? 0 : q_space_beg + 1
+ s_space_beg_left = (s_space_beg == 0) ? 0 : s_space_beg + 1
+ q_space_end_left = longest_match.q_beg - 1
+ s_space_end_left = longest_match.s_beg - 1
+
+ q_space_beg_right = longest_match.q_end + 1
+ s_space_beg_right = longest_match.s_end + 1
+ q_space_end_right = (q_space_end == @q_seq.length) ? @q_seq.length : q_space_end - 1
+ s_space_end_right = (s_space_end == @s_seq.length) ? @s_seq.length : s_space_end - 1
+
+ if q_space_end_left - q_space_beg_left > 0 and s_space_end_left - s_space_beg_left > 0
+ align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, length)
+ end
+
+ if q_space_end_right - q_space_beg_right > 0 and s_space_end_right - s_space_beg_right > 0
+ align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, length)
+ end
+ end
+
+ length /= 2
+ end
+ end
+
+ # Method for debugging purposes that upcase matching sequence while non-matches
+ # sequence is kept in lower case.
+ def upcase_match
+ @matches.each do |match|
+ @q_seq[match.q_beg .. match.q_end] = @q_seq[match.q_beg .. match.q_end].upcase
+ @s_seq[match.s_beg .. match.s_end] = @s_seq[match.s_beg .. match.s_end].upcase
+ end
+ end
+
+ # Method that insert gaps in sequences based on a list of matches and thus
+ # creating an alignment.
+ # TODO check boundaries!
+ def insert_gaps
+ @matches.sort_by! { |m| m.q_beg }
+
+ q_gaps = 0
+ s_gaps = 0
+
+ @matches.each do |match|
+ diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg)
+
+ #pp "q_gaps #{q_gaps} s_gaps #{s_gaps} diff #{diff}"
+
+ if diff < 0
+ @q_seq.insert(match.q_beg + q_gaps, "-" * diff.abs)
+ q_gaps += diff.abs
+ elsif diff > 0
+ @s_seq.insert(match.s_beg + s_gaps, "-" * diff.abs)
+ s_gaps += diff.abs
+ end
+ end
+
+ diff = @q_seq.length - @s_seq.length
+
+ if diff < 0
+ @q_seq << ("-" * diff.abs)
+ else
+ @s_seq << ("-" * diff.abs)
+ end
+ end
+
+ # Method that dumps all matches as Biopiece records for use with
+ # plot_matches.
+ def dump_bp
+ @matches.sort_by! { |m| m.q_beg }
+
+ @matches.each { |m|
+ puts "Q_BEG: #{m.q_beg}"
+ puts "Q_END: #{m.q_end}"
+ puts "S_BEG: #{m.s_beg}"
+ puts "S_END: #{m.s_end}"
+ puts "STRAND: +"
+ puts "---"
+ }
+ end
+
+ private
+
+ # Method for indexing a sequence given a search space, oligo size
+ # and step size. All oligo positions are saved in a lookup hash
+ # where the oligo is the key and the position is the value. Non-
+ # unique oligos are removed and the index returned.
+ def index_seq(space_beg, space_end, seq, oligo_size, step_size)
+ index = Hash.new
+ index_count = Hash.new(0)
+
+ (space_beg ... space_end).step(step_size).each do |pos|
+ oligo = seq[pos ... pos + oligo_size]
+
+ if oligo.length == oligo_size
+ index[oligo] = pos
+ index_count[oligo] += 1
+ end
+ end
+
+ index_count.each do |oligo, count|
+ index.delete(oligo) if count > 1
+ end
+
+ index
+ end
+
+ # Method for locating matches between two indexes where one is created
+ # using non-overlapping unique oligos and their positions and one is
+ # using overlapping unique oligos and positions. Thus iterating over the
+ # non-overlapping oligos and checking last match, this can be extended.
+ def find_matches(q_index, s_index, length)
+ matches = []
+
+ q_index.each do |oligo, q_pos|
+ if s_index[oligo]
+ s_pos = s_index[oligo]
+
+ last_match = matches.last
+ new_match = Match.new(q_pos, s_pos, length)
+
+ if last_match and
+ last_match.q_beg + last_match.length == new_match.q_beg and
+ last_match.s_beg + last_match.length == new_match.s_beg
+
+ last_match.length += length
+ else
+ matches << new_match
+ end
+ end
+ end
+
+ matches
+ end
+
+ # Class for Match objects.
+ class Match
+ attr_accessor :length
+ attr_reader :q_beg, :s_beg
+
+ def initialize(q_beg, s_beg, length)
+ @q_beg = q_beg
+ @s_beg = s_beg
+ @length = length
+ end
+
+ def q_end
+ @q_beg + @length - 1
+ end
+
+ def s_end
+ @s_beg + @length - 1
+ end
+
+ def to_s
+ "q_beg=#{q_beg} q_end=#{q_end} s_beg=#{s_beg} s_end=#{s_end} length=#{length}"
+ end
+
+ # Method to expand a match as far as possible to the left and right
+ # within a given search space.
+ def expand(q_seq, s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end)
+ expand_left(q_seq, s_seq, q_space_beg, s_space_beg)
+ expand_right(q_seq, s_seq, q_space_end, s_space_end)
+ end
+
+ private
+
+ # Method to expand a match as far as possible to the left within a given
+ # search space.
+ def expand_left(q_seq, s_seq, q_space_beg, s_space_beg)
+ while @q_beg > q_space_beg and @s_beg > s_space_beg and q_seq[@q_beg - 1] == s_seq[@s_beg - 1]
+ @q_beg -= 1
+ @s_beg -= 1
+ @length += 1
+ end
+ end
+
+ # Method to expand a match as far as possible to the right within a given
+ # search space.
+ def expand_right(q_seq, s_seq, q_space_end, s_space_end)
+ while q_end + 1 <= q_space_end and s_end + 1 <= s_space_end and q_seq[q_end + 1] == s_seq[s_end + 1]
+ @length += 1
+ end
+ end
+ end
+ end
+
class Consensus
# Method to initialize a Consensus object given a list of aligned Seq object.
def initialize(entries, options)