From: martinahansen Date: Mon, 23 Apr 2012 11:15:00 +0000 (+0000) Subject: workied on pair wise aligner X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=6fcdc25f807208008caff07608be15671809aa42;p=biopieces.git workied on pair wise aligner git-svn-id: http://biopieces.googlecode.com/svn/trunk@1807 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb index d3e5b44..bb83d9e 100755 --- a/code_ruby/lib/maasha/align.rb +++ b/code_ruby/lib/maasha/align.rb @@ -29,8 +29,8 @@ require 'maasha/fasta' class AlignError < StandardError; end; -ALPH_DNA = %w{A T C G} -ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} +ALPH_DNA = %w{A T C G} +ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} BIT_INDEL = 0 # BIT_A = 1 << 0 @@ -126,6 +126,32 @@ class Align self.new(result) end + + # Class method to create a pairwise alignment of two given Seq objects. The + # alignment is created by casting a search space the size of the sequences + # and save the longest perfect match between the sequences and recurse into + # the left and right search spaced around this match. When all search spaces + # are exhausted the saved matches are used to insert gaps in the sequences. + def self.pair(q_entry, s_entry) + q_space_beg = 0 + q_space_end = q_entry.length + s_space_beg = 0 + s_space_end = s_entry.length + + q_entry.seq.downcase! + s_entry.seq.downcase! + + ap = AlignPair.new(q_entry.seq, s_entry.seq) + + ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 256) + + ap.upcase_match + ap.insert_gaps + + #ap.dump_bp and exit + + self.new([q_entry, s_entry]) + end # Method to initialize an Align object with a list of aligned Seq objects. def initialize(entries, options = {}) @@ -197,6 +223,234 @@ class Align private + # Class for creating a pairwise alignment of two specified sequences. The + # alignment is created by casting a search space the size of the sequences + # and save the longest perfect match between the sequences and recurse into + # the left and right search spaced around this match. When all search spaces + # are exhausted the saved matches are used to insert gaps in the sequences. + class AlignPair + attr_reader :matches + + # Method to initialize an AlignPair object given two sequence strings. + def initialize(q_seq, s_seq) + @q_seq = q_seq + @s_seq = s_seq + @matches = [] + end + + # Method that locates the longest perfect match in a specified search space + # by comparing two sequence indeces. The first index is created by hashing + # for the first sequence the position of all non-overlapping unique oligos + # with the oligo as key an the position as value. The second index is + # created for the second sequence by hashing the position of all + # overlapping unique oligos in a similar way. The longest match is located + # by comparing these indeces and the longest match is expanded maximally + # within the specified search space. Finally, the longest match is used + # to define a left and a right search space which are recursed into + # until no more matches are found. + def align(q_space_beg, q_space_end, s_space_beg, s_space_end, length) +# puts "search space: #{q_space_beg}-#{q_space_end} x #{s_space_beg}-#{s_space_end}" + new_matches = [] + + while new_matches.empty? and length > 0 + if @q_seq.length >= length and @s_seq.length >= length + q_index = index_seq(q_space_beg, q_space_end, @q_seq, length, length) + s_index = index_seq(s_space_beg, s_space_end, @s_seq, length, 1) + new_matches = find_matches(q_index, s_index, length) + end + + unless new_matches.empty? + longest_match = new_matches.sort_by { |match| match.length }.last + longest_match.expand(@q_seq, @s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end) +# pp longest_match + + @matches << longest_match + + q_space_beg_left = (q_space_beg == 0) ? 0 : q_space_beg + 1 + s_space_beg_left = (s_space_beg == 0) ? 0 : s_space_beg + 1 + q_space_end_left = longest_match.q_beg - 1 + s_space_end_left = longest_match.s_beg - 1 + + q_space_beg_right = longest_match.q_end + 1 + s_space_beg_right = longest_match.s_end + 1 + q_space_end_right = (q_space_end == @q_seq.length) ? @q_seq.length : q_space_end - 1 + s_space_end_right = (s_space_end == @s_seq.length) ? @s_seq.length : s_space_end - 1 + + if q_space_end_left - q_space_beg_left > 0 and s_space_end_left - s_space_beg_left > 0 + align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, length) + end + + if q_space_end_right - q_space_beg_right > 0 and s_space_end_right - s_space_beg_right > 0 + align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, length) + end + end + + length /= 2 + end + end + + # Method for debugging purposes that upcase matching sequence while non-matches + # sequence is kept in lower case. + def upcase_match + @matches.each do |match| + @q_seq[match.q_beg .. match.q_end] = @q_seq[match.q_beg .. match.q_end].upcase + @s_seq[match.s_beg .. match.s_end] = @s_seq[match.s_beg .. match.s_end].upcase + end + end + + # Method that insert gaps in sequences based on a list of matches and thus + # creating an alignment. + # TODO check boundaries! + def insert_gaps + @matches.sort_by! { |m| m.q_beg } + + q_gaps = 0 + s_gaps = 0 + + @matches.each do |match| + diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg) + + #pp "q_gaps #{q_gaps} s_gaps #{s_gaps} diff #{diff}" + + if diff < 0 + @q_seq.insert(match.q_beg + q_gaps, "-" * diff.abs) + q_gaps += diff.abs + elsif diff > 0 + @s_seq.insert(match.s_beg + s_gaps, "-" * diff.abs) + s_gaps += diff.abs + end + end + + diff = @q_seq.length - @s_seq.length + + if diff < 0 + @q_seq << ("-" * diff.abs) + else + @s_seq << ("-" * diff.abs) + end + end + + # Method that dumps all matches as Biopiece records for use with + # plot_matches. + def dump_bp + @matches.sort_by! { |m| m.q_beg } + + @matches.each { |m| + puts "Q_BEG: #{m.q_beg}" + puts "Q_END: #{m.q_end}" + puts "S_BEG: #{m.s_beg}" + puts "S_END: #{m.s_end}" + puts "STRAND: +" + puts "---" + } + end + + private + + # Method for indexing a sequence given a search space, oligo size + # and step size. All oligo positions are saved in a lookup hash + # where the oligo is the key and the position is the value. Non- + # unique oligos are removed and the index returned. + def index_seq(space_beg, space_end, seq, oligo_size, step_size) + index = Hash.new + index_count = Hash.new(0) + + (space_beg ... space_end).step(step_size).each do |pos| + oligo = seq[pos ... pos + oligo_size] + + if oligo.length == oligo_size + index[oligo] = pos + index_count[oligo] += 1 + end + end + + index_count.each do |oligo, count| + index.delete(oligo) if count > 1 + end + + index + end + + # Method for locating matches between two indexes where one is created + # using non-overlapping unique oligos and their positions and one is + # using overlapping unique oligos and positions. Thus iterating over the + # non-overlapping oligos and checking last match, this can be extended. + def find_matches(q_index, s_index, length) + matches = [] + + q_index.each do |oligo, q_pos| + if s_index[oligo] + s_pos = s_index[oligo] + + last_match = matches.last + new_match = Match.new(q_pos, s_pos, length) + + if last_match and + last_match.q_beg + last_match.length == new_match.q_beg and + last_match.s_beg + last_match.length == new_match.s_beg + + last_match.length += length + else + matches << new_match + end + end + end + + matches + end + + # Class for Match objects. + class Match + attr_accessor :length + attr_reader :q_beg, :s_beg + + def initialize(q_beg, s_beg, length) + @q_beg = q_beg + @s_beg = s_beg + @length = length + end + + def q_end + @q_beg + @length - 1 + end + + def s_end + @s_beg + @length - 1 + end + + def to_s + "q_beg=#{q_beg} q_end=#{q_end} s_beg=#{s_beg} s_end=#{s_end} length=#{length}" + end + + # Method to expand a match as far as possible to the left and right + # within a given search space. + def expand(q_seq, s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end) + expand_left(q_seq, s_seq, q_space_beg, s_space_beg) + expand_right(q_seq, s_seq, q_space_end, s_space_end) + end + + private + + # Method to expand a match as far as possible to the left within a given + # search space. + def expand_left(q_seq, s_seq, q_space_beg, s_space_beg) + while @q_beg > q_space_beg and @s_beg > s_space_beg and q_seq[@q_beg - 1] == s_seq[@s_beg - 1] + @q_beg -= 1 + @s_beg -= 1 + @length += 1 + end + end + + # Method to expand a match as far as possible to the right within a given + # search space. + def expand_right(q_seq, s_seq, q_space_end, s_space_end) + while q_end + 1 <= q_space_end and s_end + 1 <= s_space_end and q_seq[q_end + 1] == s_seq[s_end + 1] + @length += 1 + end + end + end + end + class Consensus # Method to initialize a Consensus object given a list of aligned Seq object. def initialize(entries, options)