1 # Copyright (C) 2007-2012 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'maasha/align/match'
26 require 'maasha/math_aux'
28 FACTOR_SCORE_LENGTH = 1.0
29 FACTOR_SCORE_DIAG = -1.41
31 # Module with stuff to create a pairwise aligment.
33 # Class for creating a pairwise alignment.
35 # Class method to create a pairwise alignment of two given Seq objects.
36 def self.align(q_entry, s_entry)
37 self.new(q_entry, s_entry)
40 # Method to inialize a pairwise alignment given two Seq objects.
41 def initialize(q_entry, s_entry)
46 @q_entry.seq.downcase!
47 @s_entry.seq.downcase!
49 space = Space.new( 0, 0, @q_entry.length - 1, @s_entry.length - 1)
52 align_recurse(@q_entry.seq, @s_entry.seq, space, kmer)
59 # Method that creates an alignment by chaining matches, which are
60 # subsequences shared between two sequences. This recursive method
61 # functions by considering only matches within a given search space. If no
62 # matches are given these will be located and matches will be included
63 # depending on a calculated score. New search spaces spanning the spaces
64 # between the best scoring matches and the search space boundaries will be
65 # cast and recursed into.
66 def align_recurse(q_seq, s_seq, space, kmer, matches = [])
67 matches = matches_select_by_space(matches, space)
68 matches = matches_select_by_score(matches, space)
70 while (matches.size == 0 and kmer > 0)
71 matches = Matches.find(q_seq, s_seq, space.q_min, space.s_min, space.q_max, space.s_max, kmer)
74 matches.sort_by! { |m| m.length }
76 matches = matches_select_by_score(matches, space)
79 # $stderr.puts "#{kmer} #{matches.size} #{space.q_dim} x #{space.s_dim}"
83 if best_match = matches.pop
84 @matches << best_match
86 space_left = Space.new(space.q_min, space.s_min, best_match.q_beg - 1, best_match.s_beg - 1)
87 space_right = Space.new(best_match.q_end + 1, best_match.s_end + 1, space.q_max, space.s_max)
89 align_recurse(q_seq, s_seq, space_left, kmer, matches) unless space_left.empty?
90 align_recurse(q_seq, s_seq, space_right, kmer, matches) unless space_right.empty?
94 # Method to select matches that lies within the search space.
95 def matches_select_by_space(matches, space)
96 new_matches = matches.select do |match|
97 match.q_beg >= space.q_min and
98 match.s_beg >= space.s_min and
99 match.q_end <= space.q_max and
100 match.s_end <= space.s_max
106 # Method to select matches based on score.
107 def matches_select_by_score(matches, space)
108 matches_score(matches, space)
110 matches.select { |match| match.score > 0 }
113 def matches_score(matches, space)
114 matches.each do |match|
115 score_length = match_score_length(match)
116 score_diag = match_score_diag(match, space)
118 match.score = score_length + score_diag
121 # $stderr.puts "score_length: #{score_length} score_diag: #{score_diag} score: #{match.score}" # DEBUG
125 matches.sort_by! { |match| match.score }
128 def match_score_length(match)
129 match.length * FACTOR_SCORE_LENGTH
132 def match_score_diag(match, space)
133 if space.q_dim > space.s_dim # s_dim is the narrow end
134 dist_beg = Math.dist_point2line(match.q_beg,
138 space.q_min + space.s_dim,
139 space.s_min + space.s_dim)
141 dist_end = Math.dist_point2line( match.q_beg,
143 space.q_max - space.s_dim,
144 space.s_max - space.s_dim,
148 dist_beg = Math.dist_point2line( match.q_beg,
152 space.q_min + space.q_dim,
153 space.s_min + space.q_dim)
155 dist_end = Math.dist_point2line( match.q_beg,
157 space.q_max - space.q_dim,
158 space.s_max - space.q_dim,
163 dist_min = dist_beg < dist_end ? dist_beg : dist_end
165 dist_min * FACTOR_SCORE_DIAG
168 # Method that finds all maximally expanded non-redundant matches shared
169 # between two sequences inside a given search space.
170 def matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer)
172 redundant = Hash.new { |h, k| h[k] = [] }
174 s_index = index_seq(s_seq, s_min, s_max, kmer)
178 while q_pos <= q_max - kmer + 1
179 q_oligo = q_seq[q_pos ... q_pos + kmer]
181 s_index[q_oligo].each do |s_pos|
182 match = Match.new(q_pos, s_pos, kmer)
184 unless match_redundant?(redundant, match)
185 match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
188 match_redundant_add(redundant, match)
198 # Method that indexes a sequence within a given interval such that the
199 # index contains all oligos of a given kmer size and the positions where
200 # this oligo was located.
201 def index_seq(seq, min, max, kmer)
202 index_hash = Hash.new { |h, k| h[k] = [] }
206 while pos <= max - kmer + 1
207 oligo = seq[pos ... pos + kmer]
208 index_hash[oligo] << pos
216 # Method to check if a match is redundant.
217 def match_redundant?(redundant, match)
218 redundant[match.q_beg].each do |s_interval|
219 if s_interval.include? match.s_beg and s_interval.include? match.s_end
227 # Method that adds a match to the redundancy index.
228 def match_redundant_add(redundant, match)
229 (match.q_beg .. match.q_end).each do |q|
230 redundant[q] << (match.s_beg .. match.s_end)
234 # Method that expands a match as far as possible to the left and right.
235 def match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
236 match_expand_left(match, q_seq, s_seq, q_min, s_min)
237 match_expand_right(match, q_seq, s_seq, q_max, s_max)
242 # Method that expands a match as far as possible to the left.
243 def match_expand_left(match, q_seq, s_seq, q_min, s_min)
244 while match.q_beg > q_min and
245 match.s_beg > s_min and
246 q_seq[match.q_beg - 1] == s_seq[match.s_beg - 1]
255 # Method that expands a match as far as possible to the right.
256 def match_expand_right(match, q_seq, s_seq, q_max, s_max)
257 while match.q_end < q_max and
258 match.s_end < s_max and
259 q_seq[match.q_end + 1] == s_seq[match.s_end + 1]
266 # Method for debugging purposes that upcase matching sequence while non-matches
267 # sequence is kept in lower case.
269 @matches.each do |match|
270 @q_entry.seq[match.q_beg .. match.q_end] = @q_entry.seq[match.q_beg .. match.q_end].upcase
271 @s_entry.seq[match.s_beg .. match.s_end] = @s_entry.seq[match.s_beg .. match.s_end].upcase
275 # Method that insert gaps in sequences based on a list of matches and thus
276 # creating an alignment.
278 @matches.sort_by! { |m| m.q_beg }
283 match = @matches.first
284 diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg)
287 @q_entry.seq.insert(0, "-" * diff.abs)
290 @s_entry.seq.insert(0, "-" * diff.abs)
294 @matches[1 .. -1].each do |m|
295 diff = (q_gaps + m.q_beg) - (s_gaps + m.s_beg)
298 @q_entry.seq.insert(m.q_beg + q_gaps, "-" * diff.abs)
301 @s_entry.seq.insert(m.s_beg + s_gaps, "-" * diff.abs)
306 diff = @q_entry.length - @s_entry.length
309 @q_entry.seq << ("-" * diff.abs)
311 @s_entry.seq << ("-" * diff.abs)
316 # Class for containing a search space between two sequences q and s.
318 attr_reader :q_min, :s_min, :q_max, :s_max
320 def initialize(q_min, s_min, q_max, s_max)
336 if @q_max - @q_min >= 0 and @s_max - @s_min >= 0
344 # Class for containing a match between two sequences q and s.
346 attr_accessor :q_beg, :s_beg, :length, :score
348 def initialize(q_beg, s_beg, length, score = 0.0)
364 s = "q: #{@q_beg} #{q_end} s: #{@s_beg} #{s_end} l: #{@length} s: #{@score}"
365 s << " seq: #{seq[@q_beg .. q_end]}" if seq