1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # IUPAC nucleotide pair ambiguity equivalents are saved in an
26 # array of bit fields.
33 EQUAL = Array.new(256, 0)
34 EQUAL['A'.ord] = BIT_A
35 EQUAL['a'.ord] = BIT_A
36 EQUAL['T'.ord] = BIT_T
37 EQUAL['t'.ord] = BIT_T
38 EQUAL['U'.ord] = BIT_T
39 EQUAL['u'.ord] = BIT_T
40 EQUAL['C'.ord] = BIT_C
41 EQUAL['c'.ord] = BIT_C
42 EQUAL['G'.ord] = BIT_G
43 EQUAL['g'.ord] = BIT_G
44 EQUAL['M'.ord] = (BIT_A|BIT_C)
45 EQUAL['m'.ord] = (BIT_A|BIT_C)
46 EQUAL['R'.ord] = (BIT_A|BIT_G)
47 EQUAL['r'.ord] = (BIT_A|BIT_G)
48 EQUAL['W'.ord] = (BIT_A|BIT_T)
49 EQUAL['w'.ord] = (BIT_A|BIT_T)
50 EQUAL['S'.ord] = (BIT_C|BIT_G)
51 EQUAL['s'.ord] = (BIT_C|BIT_G)
52 EQUAL['Y'.ord] = (BIT_C|BIT_T)
53 EQUAL['y'.ord] = (BIT_C|BIT_T)
54 EQUAL['K'.ord] = (BIT_G|BIT_T)
55 EQUAL['k'.ord] = (BIT_G|BIT_T)
56 EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)
57 EQUAL['b'.ord] = (BIT_C|BIT_G|BIT_T)
58 EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)
59 EQUAL['d'.ord] = (BIT_A|BIT_G|BIT_T)
60 EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)
61 EQUAL['h'.ord] = (BIT_A|BIT_C|BIT_T)
62 EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)
63 EQUAL['v'.ord] = (BIT_A|BIT_C|BIT_G)
64 EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
65 EQUAL['n'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
67 # Module containing code to locate nucleotide patterns in sequences allowing for
68 # ambiguity codes and a given maximum edit distance.
69 # Insertions are nucleotides found in the pattern but not in the sequence.
70 # Deletions are nucleotides found in the sequence but not in the pattern.
72 # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
73 # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
75 # ------------------------------------------------------------------------------
76 # str.match(pattern[, pos[, max_edit_distance]])
79 # ------------------------------------------------------------------------------
80 # Method to locate the next pattern match starting from a given position. A match
81 # is allowed to contain a given maximum edit distance. If a match is located a
82 # Match object will be returned otherwise nil.
83 def match(pattern, pos = 0, max_edit_distance = 0)
86 @max_edit_distance = max_edit_distance
89 while @pos < @seq.length
92 return match_new if match_found?
98 # ------------------------------------------------------------------------------
99 # str.scan(pattern[, pos[, max_edit_distance]])
101 # str.scan(pattern[, pos[, max_edit_distance]]) { |match|
106 # ------------------------------------------------------------------------------
107 # Method to iterate through a sequence to locate pattern matches starting
108 # from a given position and allowing for a maximum edit distance.
109 # Matches found in block context return the Match object. Otherwise matches are
110 # returned in an Array.
111 def scan(pattern, pos = 0, max_edit_distance = 0)
114 while match = match(pattern, pos, max_edit_distance)
124 return matches unless block_given?
129 # Method to initailize the score vector and return this.
133 (0 ... @pattern.length + 1).each do |i|
134 vector[i] = Score.new(matches = 0, mismatches = 0, insertions = i)
140 # Method to update the score vector.
142 new_vector = @vector.dup
144 (0 ... @pattern.length).each do |i|
145 if match?(@seq[@pos], @pattern[i])
146 new_vector[i + 1] = @vector[i].dup
147 new_vector[i + 1].matches += 1
149 mismatch = @vector[i].dup
150 insertion = new_vector[i].dup
151 deletion = @vector[i + 1].dup
153 if deletion?(mismatch, insertion, deletion)
154 deletion.deletions += 1
155 new_vector[i + 1] = deletion
156 elsif mismatch?(mismatch, insertion, deletion)
157 mismatch.mismatches += 1
158 new_vector[i + 1] = mismatch
159 elsif insertion?(mismatch, insertion, deletion)
160 insertion.insertions += 1
161 new_vector[i + 1] = insertion
169 # Method to determine if a match occurred.
170 def match?(char1, char2)
171 (EQUAL[char1.ord] & EQUAL[char2.ord]) != 0
174 # Method to determine if a mismatch occured.
175 def mismatch?(mismatch, insertion, deletion)
176 if mismatch.edit_distance <= insertion.edit_distance and
177 mismatch.edit_distance <= deletion.edit_distance
182 # Method to determine if an insertion occured.
183 def insertion?(mismatch, insertion, deletion)
184 if insertion.edit_distance <= mismatch.edit_distance and
185 insertion.edit_distance <= deletion.edit_distance
190 # Method to determine if a deletion occured.
191 def deletion?(mismatch, insertion, deletion)
192 if deletion.edit_distance <= mismatch.edit_distance and
193 deletion.edit_distance <= insertion.edit_distance
198 # Method to print the score vector.
207 # Method that returns a Match object initialized with
208 # information from the score vector.
210 matches = @vector.last.matches
211 mismatches = @vector.last.mismatches
212 insertions = @vector.last.insertions
213 deletions = @vector.last.deletions
214 length = @pattern.length - insertions + deletions
215 pos = @pos - length + 1
216 match = @seq[pos ... pos + length]
218 Match.new(pos, match, matches, mismatches, insertions, deletions, length)
221 # Method that determines if a match was found by analyzing the score vector.
223 if @vector.last.edit_distance <= @max_edit_distance
228 # Class to instantiate Score objects that holds score information.
230 attr_accessor :matches, :mismatches, :insertions, :deletions
232 def initialize(matches = 0, mismatches = 0, insertions = 0, deletions = 0)
234 @mismatches = mismatches
235 @insertions = insertions
236 @deletions = deletions
239 # Method to calculate and return the edit distance.
241 self.mismatches + self.insertions + self.deletions
247 "(#{[self.matches, self.mismatches, self.insertions, self.deletions].join(',')})"
251 # Class for creating Match objects which contain the description of a
252 # match between a nucleotide sequence and a pattern.
254 attr_reader :pos, :match, :matches, :mismatches, :insertions, :deletions, :edit_distance, :length
256 def initialize(pos, match, matches, mismatches, insertions, deletions, length)
260 @mismatches = mismatches
261 @insertions = insertions
262 @deletions = deletions
263 @edit_distance = mismatches + insertions + deletions