# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
require 'inline' # RubyInline
+require 'maasha/seq/ambiguity'
# Error class for all exceptions to do with BackTrack.
class BackTrackError < StandardError; end
# Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
# http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
module BackTrack
+ extend Ambiguity
+
OK_PATTERN = Regexp.new('^[bflsycwphqrimtnkvadegu]+$')
MAX_MIS = 5 # Maximum number of mismatches allowed
MAX_INS = 5 # Maximum number of insertions allowed
MAX_DEL = 5 # Maximum number of deletions allowed
# ------------------------------------------------------------------------------
- # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]])
+ # str.patmatch(pattern[, options])
# -> Match
- # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match|
+ # str.patmatch(pattern[, options]) { |match|
# block
# }
# -> Match
#
+ # options:
+ # :start
+ # :stop
+ # :max_mismatches
+ # :max_insertions
+ # :max_deletions
+ #
# ------------------------------------------------------------------------------
# Method to iterate through a sequence from a given start position to the end of
# the sequence or to a given stop position to locate a pattern allowing for a
# maximum number of mismatches, insertions, and deletions. Insertions are
# nucleotides found in the pattern but not in the sequence. Deletions are
# nucleotides found in the sequence but not in the pattern.
- def patmatch(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
- self.patscan(pattern, start, max_mismatches, max_insertions, max_deletions) do |m|
+ def patmatch(pattern, options = {})
+ options[:start] ||= 0
+ options[:stop] ||= self.length - 1
+ options[:max_mismatches] ||= 0
+ options[:max_insertions] ||= 0
+ options[:max_deletions] ||= 0
+
+ self.patscan(pattern, options) do |m|
if block_given?
yield m
else
end
# ------------------------------------------------------------------------------
- # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]])
+ # str.patscan(pattern[, options])
# -> Array
- # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match|
+ # str.patscan(pattern[, options]) { |match|
# block
# }
# -> Match
#
+ # options:
+ # :start
+ # :stop
+ # :max_mismatches
+ # :max_insertions
+ # :max_deletions
+ #
# ------------------------------------------------------------------------------
# Method to iterate through a sequence from a given start position to the end of
# the sequence or to a given stop position to locate a pattern allowing for a
# nucleotides found in the sequence but not in the pattern. Matches found in
# block context return the Match object. Otherwise matches are returned in an
# Array of Match objects.
- def patscan(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
- if start.is_a? Array
- start, stop = *start
- else
- stop = self.length - 1
- end
+ def patscan(pattern, options = {})
+ options[:start] ||= 0
+ options[:stop] ||= self.length - 1
+ options[:max_mismatches] ||= 0
+ options[:max_insertions] ||= 0
+ options[:max_deletions] ||= 0
raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
- raise BackTrackError, "start: #{start} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? start
- raise BackTrackError, "stop: #{stop} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? stop
- raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches
- raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions
- raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions
+ raise BackTrackError, "start: #{options[:start]} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? options[:start]
+ raise BackTrackError, "stop: #{options[:stop]} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? options[:stop]
+ raise BackTrackError, "max_mismatches: #{options[:max_mismatches]} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? options[:max_mismatches]
+ raise BackTrackError, "max_insertions: #{options[:max_insertions]} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? options[:max_insertions]
+ raise BackTrackError, "max_deletions: #{options[:max_deletions]} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? options[:max_deletions]
matches = []
- while result = scan_C(self.seq, pattern, start, stop, max_mismatches, max_insertions, max_deletions)
+ while result = scan_C(self.seq,
+ pattern,
+ options[:start],
+ options[:stop],
+ options[:max_mismatches],
+ options[:max_insertions],
+ options[:max_deletions]
+ )
match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
if block_given?
matches << match
end
- start = result.first + 1
+ options[:start] = result.first + 1
end
return matches.empty? ? nil : matches unless block_given?
private
inline do |builder|
- # Macro for matching nucleotides including ambiguity codes.
- builder.prefix %{
- #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
- }
-
- # Bitmap for matching nucleotides including ambiguity codes.
- # For each value bits are set from the left: bit pos 1 for A,
- # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
- builder.prefix %{
- char bitmap[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
- 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
- 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
- 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- };
- }
+ add_ambiguity_macro(builder)
# Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
# mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
char *ss = s;
int state = 0;
+ unsigned int i = 0;
unsigned int e = 0;
VALUE tuple;
- if (start <= stop)
- {
- s += start;
+ s += start;
- while (*s)
+ for (i = start; i <= stop; i++, s++)
+ {
+ if ((e = backtrack(ss, s, p, mis, ins, del, state)))
{
- if ((e = backtrack(ss, s, p, mis, ins, del, state)))
- {
- tuple = rb_ary_new();
- rb_ary_push(tuple, INT2FIX((int) (s - ss)));
- rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
- return tuple;
- }
-
- s++;
+ tuple = rb_ary_new();
+ rb_ary_push(tuple, INT2FIX((int) (s - ss)));
+ rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
+ return tuple;
}
}