1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'inline' # RubyInline
27 # Error class for all exceptions to do with BackTrack.
28 class BackTrackError < StandardError; end
30 # Module containing code to locate nucleotide patterns in sequences allowing for
31 # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
32 # pattern match engine is based on a backtrack algorithm.
33 # Insertions are nucleotides found in the pattern but not in the sequence.
34 # Deletions are nucleotides found in the sequence but not in the pattern.
35 # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
36 # http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
38 OK_PATTERN = Regexp.new('^[bflsycwphqrimtnkvadegu]+$')
39 MAX_MIS = 5 # Maximum number of mismatches allowed
40 MAX_INS = 5 # Maximum number of insertions allowed
41 MAX_DEL = 5 # Maximum number of deletions allowed
43 # ------------------------------------------------------------------------------
44 # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]])
46 # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match|
51 # ------------------------------------------------------------------------------
52 # Method to iterate through a sequence from a given start position to the end of
53 # the sequence or to a given stop position to locate a pattern allowing for a
54 # maximum number of mismatches, insertions, and deletions. Insertions are
55 # nucleotides found in the pattern but not in the sequence. Deletions are
56 # nucleotides found in the sequence but not in the pattern.
57 def patmatch(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
58 self.patscan(pattern, start, max_mismatches, max_insertions, max_deletions) do |m|
75 # ------------------------------------------------------------------------------
76 # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]])
78 # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match|
83 # ------------------------------------------------------------------------------
84 # Method to iterate through a sequence from a given start position to the end of
85 # the sequence or to a given stop position to locate a pattern allowing for a
86 # maximum number of mismatches, insertions, and deletions. Insertions are
87 # nucleotides found in the pattern but not in the sequence. Deletions are
88 # nucleotides found in the sequence but not in the pattern. Matches found in
89 # block context return the Match object. Otherwise matches are returned in an
90 # Array of Match objects.
91 def patscan(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
95 stop = self.length - 1
98 raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
99 raise BackTrackError, "start: #{start} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? start
100 raise BackTrackError, "stop: #{stop} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? stop
101 raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches
102 raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions
103 raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions
107 while result = scan_C(self.seq, pattern, start, stop, max_mismatches, max_insertions, max_deletions)
108 match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
116 start = result.first + 1
119 return matches.empty? ? nil : matches unless block_given?
125 # Macro for matching nucleotides including ambiguity codes.
127 #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
130 # Bitmap for matching nucleotides including ambiguity codes.
131 # For each value bits are set from the left: bit pos 1 for A,
132 # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
135 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
140 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
141 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
142 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
147 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
149 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
154 # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
155 # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
156 # reporting the match endpoints. State is used to avoid ins followed by del and visa versa which
159 unsigned int backtrack(
160 char *ss, // Sequence start
163 unsigned int mis, // Max mismatches
164 unsigned int ins, // Max insertions
165 unsigned int del, // Max deletions
166 int state // Last event: mis, ins or del
171 while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
174 return (unsigned int) (s - ss);
177 if (mis && *s && *p && (r = backtrack(ss, s + 1, p + 1, mis - 1, ins, del, 0))) return r;
178 if (ins && *p && (state != -1) && (r = backtrack(ss, s, p + 1, mis, ins - 1, del, 1))) return r;
179 if (del && *s && (state != 1) && (r = backtrack(ss, s + 1, p, mis, ins, del - 1, -1))) return r;
186 # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins
187 # insertions and del deletions.
190 VALUE _s, // Sequence
192 VALUE _start, // Search postition start
193 VALUE _stop, // Search position stop
194 VALUE _mis, // Maximum mismatches
195 VALUE _ins, // Maximum insertions
196 VALUE _del // Maximum deletions
199 char *s = StringValuePtr(_s);
200 char *p = StringValuePtr(_p);
201 unsigned int start = FIX2UINT(_start);
202 unsigned int stop = FIX2UINT(_stop);
203 unsigned int mis = FIX2UINT(_mis);
204 unsigned int ins = FIX2UINT(_ins);
205 unsigned int del = FIX2UINT(_del);
215 for (i = start; i <= stop; i++, s++)
217 if ((e = backtrack(ss, s, p, mis, ins, del, state)))
219 tuple = rb_ary_new();
220 rb_ary_push(tuple, INT2FIX((int) (s - ss)));
221 rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
231 # Class containing match information.
233 attr_reader :pos, :length, :match
235 def initialize(pos, length, match)
242 "#{pos}:#{length}:#{match}"