1 # Copyright (C) 2007-2012 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Module with stuff to create a pairwise alignment.
27 # Class for creating a pairwise alignment.
29 # Class method to create a pairwise alignment of two given Seq objects.
30 def self.align(q_entry, s_entry)
31 self.new(q_entry, s_entry)
34 # Method to initialize a pairwise alignment given two Seq objects.
35 def initialize(q_entry, s_entry)
40 @q_entry.seq.downcase!
41 @s_entry.seq.downcase!
43 align_recurse(@q_entry.seq, @s_entry.seq, 0, 0, @q_entry.length - 1, @s_entry.length - 1, 16, [])
50 # Method that creates an alignment by chaining matches, which are
51 # subsequences shared between two sequences. This recursive method
52 # functions by considering only matches within a given search space. If no
53 # matches are given these will be located and matches will be included
54 # depending on a calculated score. New search spaces spanning the spaces
55 # between the best scoring matches and the search space boundaries will be
56 # cast and recursed into.
57 def align_recurse(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer, matches)
58 matches = matches_select_by_space(matches, q_min, s_min, q_max, s_max)
60 while (matches.size == 0 and kmer > 0)
61 matches = matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer)
65 matches_score(matches, q_min, s_min, q_max, s_max)
67 # matches.each { |m| puts m.to_s(q_seq) }
69 unless @matches.empty?
70 matches = matches.select { |match| match.score > 0 }
73 if best_match = matches.pop
74 @matches << best_match
78 l_q_max = best_match.q_beg - 1
79 l_s_max = best_match.s_beg - 1
81 r_q_min = best_match.q_end + 1
82 r_s_min = best_match.s_end + 1
86 if l_q_max - l_q_min > 0 and l_s_max - l_s_min > 0
87 align_recurse(q_seq, s_seq, l_q_min, l_s_min, l_q_max, l_s_max, kmer, matches)
90 if r_q_max - r_q_min > 0 and r_s_max - r_s_min > 0
91 align_recurse(q_seq, s_seq, r_q_min, r_s_min, r_q_max, r_s_max, kmer, matches)
96 # Method to select matches that lie within the search space.
97 def matches_select_by_space(matches, q_min, s_min, q_max, s_max)
98 new_matches = matches.select do |match|
99 match.q_beg >= q_min and
100 match.s_beg >= s_min and
101 match.q_end <= q_max and
# Method that scores every match in place and then sorts the list of matches
# by score. The score of a match is its length score minus its diagonal score
# and its corner score.
108 def matches_score(matches, q_min, s_min, q_max, s_max)
109 matches.each do |match|
110 score_length = match_score_length(match)
111 score_diag = match_score_diag(match, q_min, s_min, q_max, s_max)
112 score_corner = match_score_corner(match, q_min, s_min, q_max, s_max)
115 match.score = score_length - score_diag - score_corner
# NOTE(review): unconditional debug output - one line is written to stdout for
# every match that is scored. Confirm whether this is meant to stay enabled.
117 puts "score_length: #{score_length} score_diag: #{score_diag} score_corner: #{score_corner} score: #{match.score}"
# sort_by! orders ascending, so the highest scoring match ends up last in the
# list (align_recurse takes the best match with matches.pop).
120 matches.sort_by! { |match| match.score }
123 def match_score_length(match)
# Method that calculates the diagonal score of a match, which matches_score
# subtracts from the length score. The q_min, s_min, q_max and s_max arguments
# are not used by the lines below.
127 def match_score_diag(match, q_min, s_min, q_max, s_max)
# Absolute offset between the start of the match in q and in s.
128 dist1 = (match.q_beg - match.s_beg).abs
# FIXME(review): match.q_end - match.q_end is always 0, so dist2 is always
# equal to dist1 and the comparison below can never select a different value.
# One of the operands was presumably meant to be something else - confirm the
# intended second distance before changing the expression.
129 dist2 = ((match.q_end - match.q_end).abs - dist1).abs
# The smaller of the two distances is the resulting score.
131 dist1 < dist2 ? dist1 : dist2
134 def match_score_corner(match, q_min, s_min, q_max, s_max)
139 # Method that finds all maximally expanded non-redundant matches shared
140 # between two sequences inside a given search space.
141 def matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer)
143 redundant = Hash.new { |h, k| h[k] = [] }
145 s_index = index_seq(s_seq, s_min, s_max, kmer)
149 while q_pos <= q_max - kmer + 1
150 q_oligo = q_seq[q_pos ... q_pos + kmer]
152 s_index[q_oligo].each do |s_pos|
153 match = Match.new(q_pos, s_pos, kmer)
155 unless match_redundant?(redundant, match)
156 match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
159 match_redundant_add(redundant, match)
169 # Method that indexes a sequence within a given interval such that the
170 # index contains all oligos of a given kmer size and the positions where
171 # each oligo was located.
172 def index_seq(seq, min, max, kmer)
173 index_hash = Hash.new { |h, k| h[k] = [] }
177 while pos <= max - kmer + 1
178 oligo = seq[pos ... pos + kmer]
179 index_hash[oligo] << pos
187 # Method to check if a match is redundant.
188 def match_redundant?(redundant, match)
189 redundant[match.q_beg].each do |s_interval|
190 if s_interval.include? match.s_beg and s_interval.include? match.s_end
198 # Method that adds a match to the redundancy index.
199 def match_redundant_add(redundant, match)
# Register the subject interval of the match under every query position the
# match covers; match_redundant? looks intervals up by the q_beg of a match.
200 (match.q_beg .. match.q_end).each do |q|
201 redundant[q] << (match.s_beg .. match.s_end)
205 # Method that expands a match as far as possible to the left and right.
206 def match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
207 match_expand_left(match, q_seq, s_seq, q_min, s_min)
208 match_expand_right(match, q_seq, s_seq, q_max, s_max)
213 # Method that expands a match as far as possible to the left.
214 def match_expand_left(match, q_seq, s_seq, q_min, s_min)
215 while match.q_beg > q_min and
216 match.s_beg > s_min and
217 q_seq[match.q_beg - 1] == s_seq[match.s_beg - 1]
226 # Method that expands a match as far as possible to the right.
227 def match_expand_right(match, q_seq, s_seq, q_max, s_max)
228 while match.q_end < q_max and
229 match.s_end < s_max and
230 q_seq[match.q_end + 1] == s_seq[match.s_end + 1]
237 # Method for debugging purposes that upcases matching sequence while
238 # non-matching sequence is kept in lower case.
240 @matches.each do |match|
241 @q_entry.seq[match.q_beg .. match.q_end] = @q_entry.seq[match.q_beg .. match.q_end].upcase
242 @s_entry.seq[match.s_beg .. match.s_end] = @s_entry.seq[match.s_beg .. match.s_end].upcase
246 # Method that inserts gaps in sequences based on a list of matches, thus
247 # creating an alignment.
249 @matches.sort_by! { |m| m.q_beg }
254 match = @matches.first
255 diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg)
258 @q_entry.seq.insert(0, "-" * diff.abs)
261 @s_entry.seq.insert(0, "-" * diff.abs)
265 @matches[1 .. -1].each do |match|
266 diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg)
269 @q_entry.seq.insert(match.q_beg + q_gaps, "-" * diff.abs)
272 @s_entry.seq.insert(match.s_beg + s_gaps, "-" * diff.abs)
277 diff = @q_entry.length - @s_entry.length
280 @q_entry.seq << ("-" * diff.abs)
282 @s_entry.seq << ("-" * diff.abs)
287 # Class for containing a match between two sequences q and s.
289 attr_accessor :q_beg, :s_beg, :length, :score
291 def initialize(q_beg, s_beg, length, score = 0.0)
307 s = "q: #{@q_beg} #{q_end} s: #{@s_beg} #{s_end} l: #{@length} s: #{@score}"
308 s << " seq: #{seq[@q_beg .. q_end]}" if seq