1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'inline' # RubyInline
27 # Error class for all exceptions to do with BackTrack.
28 class BackTrackError < StandardError; end
30 # Module containing code to locate nucleotide patterns in sequences allowing for
31 # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
32 # pattern match engine is based on a backtrack algorithm.
33 # Insertions are nucleotides found in the pattern but not in the sequence.
34 # Deletions are nucleotides found in the sequence but not in the pattern.
35 # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
36 # http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
38 OK_PATTERN = Regexp.new('^[bflsycwphqrimtnkvadegu]+$')
39 MAX_MIS = 5 # Maximum number of mismatches allowed
40 MAX_INS = 5 # Maximum number of insertions allowed
41 MAX_DEL = 5 # Maximum number of deletions allowed
43 def patmatch(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
44 self.patscan(pattern, start, max_mismatches, max_insertions, max_deletions) do |m|
61 # ------------------------------------------------------------------------------
62 # str.scan(pattern[, start|[start, stop][, max_mismatches[, max_insertions[, max_deletions]]]])
64 # str.scan(pattern[, start|[start, stop][, max_mismatches[, max_insertions[, max_deletions]]]]) { |match|
69 # ------------------------------------------------------------------------------
70 # Method to iterate through a sequence from a given start position to the end of
71 # the sequence or to a given stop position to locate a pattern allowing for a
72 # maximum number of mismatches, insertions, and deletions. Insertions are
73 # nucleotides found in the pattern but not in the sequence. Deletions are
74 # nucleotides found in the sequence but not in the pattern. Matches found in
75 # block context return the Match object. Otherwise matches are returned in an
76 # Array of Match objects.
77 def patscan(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
81 stop = self.length - 1
84 raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
85 # raise BackTrackError, "start: #{start} out of range (0 .. #{self.length - 1})" unless (0 .. self.length).include? start
86 # raise BackTrackError, "stop: #{stop} out of range (0 .. #{self.length - 1})" unless (0 .. self.length).include? stop
87 # raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches
88 # raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions
89 # raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions
93 while result = scan_C(self.seq, pattern, start, stop, max_mismatches, max_insertions, max_deletions)
94 match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
102 start = result.first + 1
105 return matches.empty? ? nil : matches unless block_given?
111 # Macro for matching nucleotides including ambiguity codes.
113 #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
116 # Bitmap for matching nucleotides including ambiguity codes.
117 # For each value bits are set from the left: bit pos 1 for A,
118 # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
125 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
126 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
127 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
128 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
131 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
135 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
140 # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
141 # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
142 # reporting the match endpoints. State is used to avoid ins followed by del and visa versa which
145 unsigned int backtrack(
146 char *ss, // Sequence start
149 unsigned int mis, // Max mismatches
150 unsigned int ins, // Max insertions
151 unsigned int del, // Max deletions
152 int state // Last event: mis, ins or del
157 while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
160 return (unsigned int) (s - ss);
163 if (mis && *s && *p && (r = backtrack(ss, s + 1, p + 1, mis - 1, ins, del, 0))) return r;
164 if (ins && *p && (state != -1) && (r = backtrack(ss, s, p + 1, mis, ins - 1, del, 1))) return r;
165 if (del && *s && (state != 1) && (r = backtrack(ss, s + 1, p, mis, ins, del - 1, -1))) return r;
172 # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins
173 # insertions and del deletions.
176 VALUE _s, // Sequence
178 VALUE _start, // Search postition start
179 VALUE _stop, // Search position stop
180 VALUE _mis, // Maximum mismatches
181 VALUE _ins, // Maximum insertions
182 VALUE _del // Maximum deletions
185 char *s = StringValuePtr(_s);
186 char *p = StringValuePtr(_p);
187 unsigned int start = FIX2UINT(_start);
188 unsigned int stop = FIX2UINT(_stop);
189 unsigned int mis = FIX2UINT(_mis);
190 unsigned int ins = FIX2UINT(_ins);
191 unsigned int del = FIX2UINT(_del);
204 if ((e = backtrack(ss, s, p, mis, ins, del, state)))
206 tuple = rb_ary_new();
207 rb_ary_push(tuple, INT2FIX((int) (s - ss)));
208 rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
221 # Class containing match information.
223 attr_reader :pos, :length, :match
225 def initialize(pos, length, match)
232 "#{pos}:#{length}:#{match}"