1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'inline' # RubyInline
26 require 'maasha/seq/ambiguity'
28 # Error class for all exceptions to do with BackTrack.
29 class BackTrackError < StandardError; end
31 # Module containing code to locate nucleotide patterns in sequences allowing for
32 # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
33 # pattern match engine is based on a backtrack algorithm.
34 # Insertions are nucleotides found in the pattern but not in the sequence.
35 # Deletions are nucleotides found in the sequence but not in the pattern.
36 # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
37 # http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
41 OK_PATTERN = Regexp.new('^[bflsycwphqrimtnkvadegu]+$')
42 MAX_MIS = 5 # Maximum number of mismatches allowed
43 MAX_INS = 5 # Maximum number of insertions allowed
44 MAX_DEL = 5 # Maximum number of deletions allowed
46 # ------------------------------------------------------------------------------
47 # str.patmatch(pattern[, options])
49 # str.patmatch(pattern[, options]) { |match|
61 # ------------------------------------------------------------------------------
62 # Method to iterate through a sequence from a given start position to the end of
63 # the sequence or to a given stop position to locate a pattern allowing for a
64 # maximum number of mismatches, insertions, and deletions. Insertions are
65 # nucleotides found in the pattern but not in the sequence. Deletions are
66 # nucleotides found in the sequence but not in the pattern.
67 def patmatch(pattern, options = {})
69 options[:stop] ||= self.length - 1
70 options[:max_mismatches] ||= 0
71 options[:max_insertions] ||= 0
72 options[:max_deletions] ||= 0
74 self.patscan(pattern, options) do |m|
91 # ------------------------------------------------------------------------------
92 # str.patscan(pattern[, options])
94 # str.patscan(pattern[, options]) { |match|
106 # ------------------------------------------------------------------------------
107 # Method to iterate through a sequence from a given start position to the end of
108 # the sequence or to a given stop position to locate a pattern allowing for a
109 # maximum number of mismatches, insertions, and deletions. Insertions are
110 # nucleotides found in the pattern but not in the sequence. Deletions are
111 # nucleotides found in the sequence but not in the pattern. Matches found in
112 # block context return the Match object. Otherwise matches are returned in an
113 # Array of Match objects.
114 def patscan(pattern, options = {})
115 options[:start] ||= 0
116 options[:stop] ||= self.length - 1
117 options[:max_mismatches] ||= 0
118 options[:max_insertions] ||= 0
119 options[:max_deletions] ||= 0
121 raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
122 raise BackTrackError, "start: #{options[:start]} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? options[:start]
123 raise BackTrackError, "stop: #{options[:stop]} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? options[:stop]
124 raise BackTrackError, "max_mismatches: #{options[:max_mismatches]} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? options[:max_mismatches]
125 raise BackTrackError, "max_insertions: #{options[:max_insertions]} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? options[:max_insertions]
126 raise BackTrackError, "max_deletions: #{options[:max_deletions]} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? options[:max_deletions]
130 while result = scan_C(self.seq,
134 options[:max_mismatches],
135 options[:max_insertions],
136 options[:max_deletions]
138 match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
146 options[:start] = result.first + 1
149 return matches unless block_given?
155 add_ambiguity_macro(builder)
157 # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
158 # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
159 # reporting the match endpoints. State is used to avoid ins followed by del and visa versa which
162 unsigned int backtrack(
163 char *ss, // Sequence start
166 unsigned int mis, // Max mismatches
167 unsigned int ins, // Max insertions
168 unsigned int del, // Max deletions
169 int state // Last event: mis, ins or del
174 while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
177 return (unsigned int) (s - ss);
180 if (mis && *s && *p && (r = backtrack(ss, s + 1, p + 1, mis - 1, ins, del, 0))) return r;
181 if (ins && *p && (state != -1) && (r = backtrack(ss, s, p + 1, mis, ins - 1, del, 1))) return r;
182 if (del && *s && (state != 1) && (r = backtrack(ss, s + 1, p, mis, ins, del - 1, -1))) return r;
189 # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins
190 # insertions and del deletions.
193 VALUE _s, // Sequence
195 VALUE _start, // Search postition start
196 VALUE _stop, // Search position stop
197 VALUE _mis, // Maximum mismatches
198 VALUE _ins, // Maximum insertions
199 VALUE _del // Maximum deletions
202 char *s = StringValuePtr(_s);
203 char *p = StringValuePtr(_p);
204 unsigned int start = FIX2UINT(_start);
205 unsigned int stop = FIX2UINT(_stop);
206 unsigned int mis = FIX2UINT(_mis);
207 unsigned int ins = FIX2UINT(_ins);
208 unsigned int del = FIX2UINT(_del);
218 for (i = start; i <= stop; i++, s++)
220 if ((e = backtrack(ss, s, p, mis, ins, del, state)))
222 tuple = rb_ary_new();
223 rb_ary_push(tuple, INT2FIX((int) (s - ss)));
224 rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
234 # Class containing match information.
236 attr_reader :pos, :length, :match
238 def initialize(pos, length, match)
245 "#{pos}:#{length}:#{match}"