1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'inline' # RubyInline
26 require 'maasha/seq/ambiguity'
28 # Error class for all exceptions to do with BackTrack.
29 class BackTrackError < StandardError; end
31 # Module containing code to locate nucleotide patterns in sequences allowing for
32 # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
33 # pattern match engine is based on a backtrack algorithm.
34 # Insertions are nucleotides found in the pattern but not in the sequence.
35 # Deletions are nucleotides found in the sequence but not in the pattern.
36 # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
37 # http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
41 OK_PATTERN = Regexp.new('^[bflsycwphqrimtnkvadegu]+$')
42 MAX_MIS = 5 # Maximum number of mismatches allowed
43 MAX_INS = 5 # Maximum number of insertions allowed
44 MAX_DEL = 5 # Maximum number of deletions allowed
46 # ------------------------------------------------------------------------------
47 # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]])
49 # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match|
54 # ------------------------------------------------------------------------------
55 # Method to iterate through a sequence from a given start position to the end of
56 # the sequence or to a given stop position to locate a pattern allowing for a
57 # maximum number of mismatches, insertions, and deletions. Insertions are
58 # nucleotides found in the pattern but not in the sequence. Deletions are
59 # nucleotides found in the sequence but not in the pattern.
60 def patmatch(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
61 self.patscan(pattern, start, max_mismatches, max_insertions, max_deletions) do |m|
78 # ------------------------------------------------------------------------------
79 # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]])
81 # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match|
86 # ------------------------------------------------------------------------------
87 # Method to iterate through a sequence from a given start position to the end of
88 # the sequence or to a given stop position to locate a pattern allowing for a
89 # maximum number of mismatches, insertions, and deletions. Insertions are
90 # nucleotides found in the pattern but not in the sequence. Deletions are
91 # nucleotides found in the sequence but not in the pattern. Matches found in
92 # block context return the Match object. Otherwise matches are returned in an
93 # Array of Match objects.
94 def patscan(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
98 stop = self.length - 1
101 raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
102 raise BackTrackError, "start: #{start} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? start
103 raise BackTrackError, "stop: #{stop} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? stop
104 raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches
105 raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions
106 raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions
110 while result = scan_C(self.seq, pattern, start, stop, max_mismatches, max_insertions, max_deletions)
111 match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
119 start = result.first + 1
122 return matches.empty? ? nil : matches unless block_given?
128 add_ambiguity_macro(builder)
130 # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
131 # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
132 # reporting the match endpoints. State is used to avoid ins followed by del and visa versa which
135 unsigned int backtrack(
136 char *ss, // Sequence start
139 unsigned int mis, // Max mismatches
140 unsigned int ins, // Max insertions
141 unsigned int del, // Max deletions
142 int state // Last event: mis, ins or del
147 while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
150 return (unsigned int) (s - ss);
153 if (mis && *s && *p && (r = backtrack(ss, s + 1, p + 1, mis - 1, ins, del, 0))) return r;
154 if (ins && *p && (state != -1) && (r = backtrack(ss, s, p + 1, mis, ins - 1, del, 1))) return r;
155 if (del && *s && (state != 1) && (r = backtrack(ss, s + 1, p, mis, ins, del - 1, -1))) return r;
162 # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins
163 # insertions and del deletions.
166 VALUE _s, // Sequence
168 VALUE _start, // Search postition start
169 VALUE _stop, // Search position stop
170 VALUE _mis, // Maximum mismatches
171 VALUE _ins, // Maximum insertions
172 VALUE _del // Maximum deletions
175 char *s = StringValuePtr(_s);
176 char *p = StringValuePtr(_p);
177 unsigned int start = FIX2UINT(_start);
178 unsigned int stop = FIX2UINT(_stop);
179 unsigned int mis = FIX2UINT(_mis);
180 unsigned int ins = FIX2UINT(_ins);
181 unsigned int del = FIX2UINT(_del);
191 for (i = start; i <= stop; i++, s++)
193 if ((e = backtrack(ss, s, p, mis, ins, del, state)))
195 tuple = rb_ary_new();
196 rb_ary_push(tuple, INT2FIX((int) (s - ss)));
197 rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
207 # Class containing match information.
209 attr_reader :pos, :length, :match
211 def initialize(pos, length, match)
218 "#{pos}:#{length}:#{match}"