1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'inline' # RubyInline
27 # Error class for all exceptions to do with BackTrack.
28 class BackTrackError < StandardError; end
30 # Module containing code to locate nucleotide patterns in sequences allowing for
31 # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
32 # pattern match engine is based on a backtrack algorithm.
33 # Insertions are nucleotides found in the pattern but not in the sequence.
34 # Deletions are nucleotides found in the sequence but not in the pattern.
35 # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
36 # http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
38 OK_PATTERN = Regexp.new('^[bflsycwphqrimtnkvadegu]+$')
39 MAX_MIS = 5 # Maximum number of mismatches allowed
40 MAX_INS = 5 # Maximum number of insertions allowed
41 MAX_DEL = 5 # Maximum number of deletions allowed
43 # ------------------------------------------------------------------------------
44 # str.scan(pattern[, max_mismatches[, max_insertions[, max_deletions]]])
46 # str.scan(pattern[, max_mismatches[, max_insertions[, max_deletions]]]) { |match|
51 # ------------------------------------------------------------------------------
52 # Method to iterate through a sequence to locate pattern matches starting at a
53 # given offset allowing for a maximum number of mismatches, insertions, and
54 # deletions. Insertions are nucleotides found in the pattern but not in the
55 # sequence. Deletions are nucleotides found in the sequence but not in the
56 # pattern. Matches found in block context return the Match object. Otherwise
57 # matches are returned in an Array of Match objects.
58 def patscan(pattern, offset = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
59 raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
60 raise BackTrackError, "offset: #{offset} out of range (0 ... #{self.length})" unless (0 ... self.length).include? offset
61 raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches
62 raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions
63 raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions
67 while result = track_C(self.seq, self.length, pattern, offset, max_mismatches, max_insertions, max_deletions)
68 match = Match.new(result[0], result[1], result[2], result[3], result[4], self.seq[result[0] ... result[0] + result[1]])
75 offset = match.pos + 1
78 return matches.empty? ? nil : matches unless block_given?
84 # Macro for matching nucleotides including ambiguity codes.
86 #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
89 # Defining a struct for returning match information.
100 # Bitmap for matching nucleotides including ambiguity codes.
101 # For each value bits are set from the left: bit pos 1 for A,
102 # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
109 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
110 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
111 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
112 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
124 # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
125 # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
126 # reporting the match endpoints.
129 char *ss, // Sequence start
132 unsigned int mis, // Max mismatches
133 unsigned int ins, // Max insertions
134 unsigned int del // Max deletions
137 match found = {0, 0, 0, 0};
140 while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
144 f->end = (unsigned int) (s - ss);
153 if (mis && *s && *p && (f = backtrack(ss, s + 1, p + 1, mis - 1, ins, del))) return f;
154 if (ins && *p && (f = backtrack(ss, s, p + 1, mis, ins - 1, del))) return f;
155 if (del && *s && (f = backtrack(ss, s + 1, p, mis, ins, del - 1))) return f;
162 # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins
163 # insertions and del deletions.
166 VALUE _s, // Sequence
167 VALUE _len, // Sequence length
169 VALUE _pos, // Start position
170 VALUE _max_mis, // Maximum mismatches
171 VALUE _max_ins, // Maximum insertions
172 VALUE _max_del // Maximum deletions
175 char *s = StringValuePtr(_s);
176 char *p = StringValuePtr(_p);
177 unsigned int len = FIX2UINT(_len);
178 unsigned int pos = FIX2UINT(_pos);
179 unsigned int max_mis = FIX2UINT(_max_mis);
180 unsigned int max_ins = FIX2UINT(_max_ins);
181 unsigned int max_del = FIX2UINT(_max_del);
185 unsigned int mbeg = 0;
186 unsigned int mlen = 0;
187 unsigned int mmis = 0;
188 unsigned int mins = 0;
189 unsigned int mdel = 0;
198 if ((f = backtrack(ss, s, p, max_mis, max_ins, max_del)))
201 mlen = f->end - mbeg;
202 mmis = max_mis - f->mis;
203 mins = max_ins - f->ins;
204 mdel = max_del - f->del;
207 rb_ary_push(ary, INT2FIX(mbeg));
208 rb_ary_push(ary, INT2FIX(mlen));
209 rb_ary_push(ary, INT2FIX(mmis));
210 rb_ary_push(ary, INT2FIX(mins));
211 rb_ary_push(ary, INT2FIX(mdel));
224 # Class containing match information.
226 attr_reader :pos, :length, :mis, :ins, :del, :match
228 def initialize(pos, length, mis, ins, del, match)
238 "#{@pos}:#{@length}:#{@mis}:#{@ins}:#{@del}:#{@match}"