1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # IUPAC nucleotide pair ambiguity equivalents are saved in an
26 # array of bit fields.
33 EQUAL = Array.new(256, 0)
34 EQUAL['A'.ord] = BIT_A
35 EQUAL['a'.ord] = BIT_A
36 EQUAL['T'.ord] = BIT_T
37 EQUAL['t'.ord] = BIT_T
38 EQUAL['U'.ord] = BIT_T
39 EQUAL['u'.ord] = BIT_T
40 EQUAL['C'.ord] = BIT_C
41 EQUAL['c'.ord] = BIT_C
42 EQUAL['G'.ord] = BIT_G
43 EQUAL['g'.ord] = BIT_G
44 EQUAL['M'.ord] = (BIT_A|BIT_C)
45 EQUAL['m'.ord] = (BIT_A|BIT_C)
46 EQUAL['R'.ord] = (BIT_A|BIT_G)
47 EQUAL['r'.ord] = (BIT_A|BIT_G)
48 EQUAL['W'.ord] = (BIT_A|BIT_T)
49 EQUAL['w'.ord] = (BIT_A|BIT_T)
50 EQUAL['S'.ord] = (BIT_C|BIT_G)
51 EQUAL['s'.ord] = (BIT_C|BIT_G)
52 EQUAL['Y'.ord] = (BIT_C|BIT_T)
53 EQUAL['y'.ord] = (BIT_C|BIT_T)
54 EQUAL['K'.ord] = (BIT_G|BIT_T)
55 EQUAL['k'.ord] = (BIT_G|BIT_T)
56 EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)
57 EQUAL['b'.ord] = (BIT_C|BIT_G|BIT_T)
58 EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)
59 EQUAL['d'.ord] = (BIT_A|BIT_G|BIT_T)
60 EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)
61 EQUAL['h'.ord] = (BIT_A|BIT_C|BIT_T)
62 EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)
63 EQUAL['v'.ord] = (BIT_A|BIT_C|BIT_G)
64 EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
65 EQUAL['n'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
67 # Module containing code to locate nucleotide patterns in sequences allowing for
68 # ambiguity codes and a given maximum edit distance.
69 # Insertions are nucleotides found in the pattern but not in the sequence.
70 # Deletions are nucleotides found in the sequence but not in the pattern.
72 # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
73 # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
75 # ------------------------------------------------------------------------------
76 # str.match(pattern[, pos[, max_edit_distance]])
79 # ------------------------------------------------------------------------------
80 # Method to locate the next pattern match starting from a given position. A match
81 # is allowed to contain a given maximum edit distance. If a match is located a
82 # Match object will be returned otherwise nil.
83 def match(pattern, pos = 0, max_edit_distance = 0)
86 @max_edit_distance = max_edit_distance
89 while @pos < @seq.length
92 return match_new if match_found?
98 # ------------------------------------------------------------------------------
99 # str.scan(pattern[, pos[, max_edit_distance]])
101 # str.scan(pattern[, pos[, max_edit_distance]]) { |match|
106 # ------------------------------------------------------------------------------
107 # Method to iterate through a sequence to locate pattern matches starting
108 # from a given position and allowing for a maximum edit distance.
109 # Matches found in block context return the Match object. Otherwise matches are
110 # returned in an Array.
111 def scan(pattern, pos = 0, max_edit_distance = 0)
114 while match = match(pattern, pos, max_edit_distance)
124 return matches unless block_given?
129 # Method to initailize the score vector and return this.
133 (0 ... @pattern.length + 1).each do |i|
134 vector[i] = Score.new(matches = 0, mismatches = 0, insertions = i)
140 # Method to update the score vector.
142 score_diag = @vector[0]
143 score_up = Score.new # insertion
144 score_left = @vector[1] # deletion
146 (0 ... @pattern.length).each do |i|
147 if match?(@seq[@pos], @pattern[i])
148 new_score = score_diag.dup
149 new_score.matches += 1
151 if deletion?(score_diag, score_up, score_left)
152 new_score = score_left.dup
153 new_score.deletions += 1
154 elsif mismatch?(score_diag, score_up, score_left)
155 new_score = score_diag.dup
156 new_score.mismatches += 1
157 elsif insertion?(score_diag, score_up, score_left)
158 new_score = score_up.dup
159 new_score.insertions += 1
163 score_diag = @vector[i + 1]
165 score_left = @vector[i + 2]
167 @vector[i + 1] = new_score
171 # Method to determine if a match occurred.
172 def match?(char1, char2)
173 (EQUAL[char1.ord] & EQUAL[char2.ord]) != 0
176 # Method to determine if a mismatch occured.
177 def mismatch?(score_diag, score_up, score_left)
178 if score_diag.edit_distance <= score_up.edit_distance and
179 score_diag.edit_distance <= score_left.edit_distance
184 # Method to determine if an insertion occured.
185 def insertion?(score_diag, score_up, score_left)
186 if score_up.edit_distance <= score_diag.edit_distance and
187 score_up.edit_distance <= score_left.edit_distance
192 # Method to determine if a deletion occured.
193 def deletion?(score_diag, score_up, score_left)
194 if score_left.edit_distance <= score_diag.edit_distance and
195 score_left.edit_distance <= score_up.edit_distance
200 # Method to print the score vector.
209 # Method that returns a Match object initialized with
210 # information from the score vector.
212 matches = @vector.last.matches
213 mismatches = @vector.last.mismatches
214 insertions = @vector.last.insertions
215 deletions = @vector.last.deletions
216 length = @pattern.length - insertions + deletions
217 pos = @pos - length + 1
218 match = @seq[pos ... pos + length]
220 Match.new(pos, match, matches, mismatches, insertions, deletions, length)
223 # Method that determines if a match was found by analyzing the score vector.
225 if @vector.last.edit_distance <= @max_edit_distance
230 # Class to instantiate Score objects that holds score information.
232 attr_accessor :matches, :mismatches, :insertions, :deletions
234 def initialize(matches = 0, mismatches = 0, insertions = 0, deletions = 0)
236 @mismatches = mismatches
237 @insertions = insertions
238 @deletions = deletions
241 # Method to calculate and return the edit distance.
243 self.mismatches + self.insertions + self.deletions
249 "(#{[self.matches, self.mismatches, self.insertions, self.deletions].join(',')})"
253 # Class for creating Match objects which contain the description of a
254 # match between a nucleotide sequence and a pattern.
256 attr_reader :pos, :match, :matches, :mismatches, :insertions, :deletions, :edit_distance, :length
258 def initialize(pos, match, matches, mismatches, insertions, deletions, length)
262 @mismatches = mismatches
263 @insertions = insertions
264 @deletions = deletions
265 @edit_distance = mismatches + insertions + deletions