1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # IUPAC nucleotide pair ambiguity equivalents are saved in an
26 # array of bit fields.
33 EQUAL = Array.new(256, 0)
34 EQUAL['A'.ord] = BIT_A
35 EQUAL['a'.ord] = BIT_A
36 EQUAL['T'.ord] = BIT_T
37 EQUAL['t'.ord] = BIT_T
38 EQUAL['U'.ord] = BIT_T
39 EQUAL['u'.ord] = BIT_T
40 EQUAL['C'.ord] = BIT_C
41 EQUAL['c'.ord] = BIT_C
42 EQUAL['G'.ord] = BIT_G
43 EQUAL['g'.ord] = BIT_G
44 EQUAL['M'.ord] = (BIT_A|BIT_C)
45 EQUAL['m'.ord] = (BIT_A|BIT_C)
46 EQUAL['R'.ord] = (BIT_A|BIT_G)
47 EQUAL['r'.ord] = (BIT_A|BIT_G)
48 EQUAL['W'.ord] = (BIT_A|BIT_T)
49 EQUAL['w'.ord] = (BIT_A|BIT_T)
50 EQUAL['S'.ord] = (BIT_C|BIT_G)
51 EQUAL['s'.ord] = (BIT_C|BIT_G)
52 EQUAL['Y'.ord] = (BIT_C|BIT_T)
53 EQUAL['y'.ord] = (BIT_C|BIT_T)
54 EQUAL['K'.ord] = (BIT_G|BIT_T)
55 EQUAL['k'.ord] = (BIT_G|BIT_T)
56 EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)
57 EQUAL['b'.ord] = (BIT_C|BIT_G|BIT_T)
58 EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)
59 EQUAL['d'.ord] = (BIT_A|BIT_G|BIT_T)
60 EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)
61 EQUAL['h'.ord] = (BIT_A|BIT_C|BIT_T)
62 EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)
63 EQUAL['v'.ord] = (BIT_A|BIT_C|BIT_G)
64 EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
65 EQUAL['n'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
67 # Module containing code to locate nucleotide patterns in sequences allowing for
68 # ambiguity codes and a given maximum edit distance.
69 # Insertions are nucleotides found in the pattern but not in the sequence.
70 # Deletions are nucleotides found in the sequence but not in the pattern.
72 # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
73 # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
75 # ------------------------------------------------------------------------------
76 # str.match(pattern[, pos[, max_edit_distance]])
79 # ------------------------------------------------------------------------------
80 # Method to locate the next pattern match starting from a given position. A match
81 # is allowed to contain a given maximum edit distance. If a match is located a
82 # Match object will be returned otherwise nil.
83 def match(pattern, pos = 0, max_edit_distance = 0)
84 vector = Vector.new(@seq, pattern, max_edit_distance)
86 while pos < @seq.length
89 return vector.to_match(pos) if vector.match_found?
97 # ------------------------------------------------------------------------------
98 # str.scan(pattern[, pos[, max_edit_distance]])
100 # str.scan(pattern[, pos[, max_edit_distance]]) { |match|
105 # ------------------------------------------------------------------------------
106 # Method to iterate through a sequence to locate pattern matches starting
107 # from a given position and allowing for a maximum edit distance.
108 # Matches found in block context return the Match object. Otherwise matches are
109 # returned in an Array.
110 def scan(pattern, pos = 0, max_edit_distance = 0)
113 while match = match(pattern, pos, max_edit_distance)
123 return matches unless block_given?
127 # Class containing the score vector used for locating matches.
129 # Method to initailize the score vector.
130 def initialize(seq, pattern, max_edit_distance)
133 @max_edit_distance = max_edit_distance
136 (0 ... @pattern.length + 1).each do |i|
137 @vector[i] = Score.new(matches = 0, mismatches = 0, insertions = i, deletions = 0, edit_distance = i)
141 # Method to update the score vector.
143 score_diag = @vector[0]
144 score_up = Score.new # insertion
145 score_left = @vector[1] # deletion
147 (0 ... @pattern.length).each do |i|
148 if match?(@seq[pos], @pattern[i])
149 new_score = score_diag.dup
150 new_score.matches += 1
152 if deletion?(score_diag, score_up, score_left)
153 new_score = score_left.dup
154 new_score.deletions += 1
155 elsif mismatch?(score_diag, score_up, score_left)
156 new_score = score_diag.dup
157 new_score.mismatches += 1
158 elsif insertion?(score_diag, score_up, score_left)
159 new_score = score_up.dup
160 new_score.insertions += 1
163 new_score.edit_distance += 1
166 score_diag = @vector[i + 1]
168 score_left = @vector[i + 2]
170 @vector[i + 1] = new_score
174 # Method that determines if a match was found by analyzing the score vector.
176 if @vector.last.edit_distance <= @max_edit_distance
181 # Method that returns a Match object initialized with
182 # information from the score vector.
184 matches = @vector.last.matches
185 mismatches = @vector.last.mismatches
186 insertions = @vector.last.insertions
187 deletions = @vector.last.deletions
188 length = @pattern.length - insertions + deletions
189 offset = pos - length + 1
190 match = @seq[offset ... offset + length]
192 Match.new(offset, match, matches, mismatches, insertions, deletions, length)
195 # Method to convert the score vector to a string.
197 "(m,m,i,d,e)\n" + @vector.join("\n") + "\n\n"
202 # Method to determine if a match occurred.
203 def match?(char1, char2)
204 (EQUAL[char1.ord] & EQUAL[char2.ord]) != 0
207 # Method to determine if a mismatch occured.
208 def mismatch?(score_diag, score_up, score_left)
209 if score_diag.edit_distance <= score_up.edit_distance and
210 score_diag.edit_distance <= score_left.edit_distance
215 # Method to determine if an insertion occured.
216 def insertion?(score_diag, score_up, score_left)
217 if score_up.edit_distance <= score_diag.edit_distance and
218 score_up.edit_distance <= score_left.edit_distance
223 # Method to determine if a deletion occured.
224 def deletion?(score_diag, score_up, score_left)
225 if score_left.edit_distance <= score_diag.edit_distance and
226 score_left.edit_distance <= score_up.edit_distance
232 # Class to instantiate Score objects that holds score information.
234 attr_accessor :matches, :mismatches, :insertions, :deletions, :edit_distance
236 def initialize(matches = 0, mismatches = 0, insertions = 0, deletions = 0, edit_distance = 0)
238 @mismatches = mismatches
239 @insertions = insertions
240 @deletions = deletions
241 @edit_distance = edit_distance
245 "(#{[self.matches, self.mismatches, self.insertions, self.deletions, self.edit_distance].join(',')})"
249 # Class for creating Match objects which contain the description of a
250 # match between a nucleotide sequence and a pattern.
252 attr_reader :pos, :match, :matches, :mismatches, :insertions, :deletions, :edit_distance, :length
254 def initialize(pos, match, matches, mismatches, insertions, deletions, length)
258 @mismatches = mismatches
259 @insertions = insertions
260 @deletions = deletions
261 @edit_distance = mismatches + insertions + deletions