1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # IUPAC nucleotide pair ambiguity equivalents are saved in an
26 # array of bit fields.
33 EQUAL = Array.new(256, 0)
34 EQUAL['A'.ord] = BIT_A
35 EQUAL['a'.ord] = BIT_A
36 EQUAL['T'.ord] = BIT_T
37 EQUAL['t'.ord] = BIT_T
38 EQUAL['U'.ord] = BIT_T
39 EQUAL['u'.ord] = BIT_T
40 EQUAL['C'.ord] = BIT_C
41 EQUAL['c'.ord] = BIT_C
42 EQUAL['G'.ord] = BIT_G
43 EQUAL['g'.ord] = BIT_G
44 EQUAL['M'.ord] = (BIT_A|BIT_C)
45 EQUAL['m'.ord] = (BIT_A|BIT_C)
46 EQUAL['R'.ord] = (BIT_A|BIT_G)
47 EQUAL['r'.ord] = (BIT_A|BIT_G)
48 EQUAL['W'.ord] = (BIT_A|BIT_T)
49 EQUAL['w'.ord] = (BIT_A|BIT_T)
50 EQUAL['S'.ord] = (BIT_C|BIT_G)
51 EQUAL['s'.ord] = (BIT_C|BIT_G)
52 EQUAL['Y'.ord] = (BIT_C|BIT_T)
53 EQUAL['y'.ord] = (BIT_C|BIT_T)
54 EQUAL['K'.ord] = (BIT_G|BIT_T)
55 EQUAL['k'.ord] = (BIT_G|BIT_T)
56 EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)
57 EQUAL['b'.ord] = (BIT_C|BIT_G|BIT_T)
58 EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)
59 EQUAL['d'.ord] = (BIT_A|BIT_G|BIT_T)
60 EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)
61 EQUAL['h'.ord] = (BIT_A|BIT_C|BIT_T)
62 EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)
63 EQUAL['v'.ord] = (BIT_A|BIT_C|BIT_G)
64 EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
65 EQUAL['n'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
67 # Module containing code to locate nucleotide patterns in sequences allowing for
68 # ambiguity codes and a given maximum edit distance.
69 # Insertions are nucleotides found in the pattern but not in the sequence.
70 # Deletions are nucleotides found in the sequence but not in the pattern.
72 # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
73 # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
75 # ------------------------------------------------------------------------------
76 # str.match(pattern[, pos[, max_edit_distance]])
79 # ------------------------------------------------------------------------------
80 # Method to locate the next pattern match starting from a given position. A match
81 # is allowed to contain a given maximum edit distance. If a match is located a
82 # Match object will be returned otherwise nil.
83 def match(pattern, pos = 0, max_edit_distance = 0)
86 @max_edit_distance = max_edit_distance
89 while @pos < @seq.length
92 return match_new if match_found?
98 # ------------------------------------------------------------------------------
99 # str.scan(pattern[, pos[, max_edit_distance]])
101 # str.scan(pattern[, pos[, max_edit_distance]]) { |match|
106 # ------------------------------------------------------------------------------
107 # Method to iterate through a sequence to locate pattern matches starting
108 # from a given position and allowing for a maximum edit distance.
109 # Matches found in block context return the Match object. Otherwise matches are
110 # returned in an Array.
111 def scan(pattern, pos = 0, max_edit_distance = 0)
114 while match = match(pattern, pos, max_edit_distance)
124 return matches unless block_given?
129 # Method to initailize the score vector and return this.
133 (0 ... @pattern.length + 1).each do |i|
134 vector[i] = Score.new(matches = 0, mismatches = 0, insertions = i)
140 # Method to update the score vector.
143 new_vector[0] = Score.new
145 (0 ... @pattern.length).each do |i|
146 score_diag = @vector[i]
147 score_up = new_vector[i] # insertion
148 score_left = @vector[i + 1] # deletion
150 if match?(@seq[@pos], @pattern[i])
151 new_score = score_diag.dup
152 new_score.matches += 1
154 if deletion?(score_diag, score_up, score_left)
155 new_score = score_left.dup
156 new_score.deletions += 1
157 elsif mismatch?(score_diag, score_up, score_left)
158 new_score = score_diag.dup
159 new_score.mismatches += 1
160 elsif insertion?(score_diag, score_up, score_left)
161 new_score = score_up.dup
162 new_score.insertions += 1
166 new_vector[i + 1] = new_score
172 # Method to determine if a match occurred.
173 def match?(char1, char2)
174 (EQUAL[char1.ord] & EQUAL[char2.ord]) != 0
177 # Method to determine if a mismatch occured.
178 def mismatch?(mismatch, insertion, deletion)
179 if mismatch.edit_distance <= insertion.edit_distance and
180 mismatch.edit_distance <= deletion.edit_distance
185 # Method to determine if an insertion occured.
186 def insertion?(mismatch, insertion, deletion)
187 if insertion.edit_distance <= mismatch.edit_distance and
188 insertion.edit_distance <= deletion.edit_distance
193 # Method to determine if a deletion occured.
194 def deletion?(mismatch, insertion, deletion)
195 if deletion.edit_distance <= mismatch.edit_distance and
196 deletion.edit_distance <= insertion.edit_distance
201 # Method to print the score vector.
210 # Method that returns a Match object initialized with
211 # information from the score vector.
213 matches = @vector.last.matches
214 mismatches = @vector.last.mismatches
215 insertions = @vector.last.insertions
216 deletions = @vector.last.deletions
217 length = @pattern.length - insertions + deletions
218 pos = @pos - length + 1
219 match = @seq[pos ... pos + length]
221 Match.new(pos, match, matches, mismatches, insertions, deletions, length)
224 # Method that determines if a match was found by analyzing the score vector.
226 if @vector.last.edit_distance <= @max_edit_distance
231 # Class to instantiate Score objects that holds score information.
233 attr_accessor :matches, :mismatches, :insertions, :deletions
235 def initialize(matches = 0, mismatches = 0, insertions = 0, deletions = 0)
237 @mismatches = mismatches
238 @insertions = insertions
239 @deletions = deletions
242 # Method to calculate and return the edit distance.
244 self.mismatches + self.insertions + self.deletions
250 "(#{[self.matches, self.mismatches, self.insertions, self.deletions].join(',')})"
254 # Class for creating Match objects which contain the description of a
255 # match between a nucleotide sequence and a pattern.
257 attr_reader :pos, :match, :matches, :mismatches, :insertions, :deletions, :edit_distance, :length
259 def initialize(pos, match, matches, mismatches, insertions, deletions, length)
263 @mismatches = mismatches
264 @insertions = insertions
265 @deletions = deletions
266 @edit_distance = mismatches + insertions + deletions