# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
require 'inline' # RubyInline
+require 'maasha/seq/ambiguity'
# Error class for all exceptions to do with BackTrack.
class BackTrackError < StandardError; end
# Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow:
# http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/
module BackTrack
+ extend Ambiguity
+
OK_PATTERN = Regexp.new('^[bflsycwphqrimtnkvadegu]+$')
MAX_MIS = 5 # Maximum number of mismatches allowed
MAX_INS = 5 # Maximum number of insertions allowed
MAX_DEL = 5 # Maximum number of deletions allowed
# ------------------------------------------------------------------------------
- # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]])
+ # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]])
+ # -> Match
+ # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match|
+ # block
+ # }
+ # -> Match
+ #
+ # ------------------------------------------------------------------------------
+ # Method to iterate through a sequence from a given start position to the end of
+ # the sequence or to a given stop position to locate a pattern allowing for a
+ # maximum number of mismatches, insertions, and deletions. Insertions are
+ # nucleotides found in the pattern but not in the sequence. Deletions are
+ # nucleotides found in the sequence but not in the pattern.
+ def patmatch(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
+ self.patscan(pattern, start, max_mismatches, max_insertions, max_deletions) do |m|
+ if block_given?
+ yield m
+ else
+ return m
+ end
+
+ break
+ end
+
+ if block_given?
+ yield nil
+ else
+ return nil
+ end
+ end
+
+ # ------------------------------------------------------------------------------
+ # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]])
# -> Array
- # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) { |match|
+ # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match|
# block
# }
# -> Match
#
# ------------------------------------------------------------------------------
- # Method to iterate through a sequence to locate pattern matches starting at a
- # given offset allowing for a maximum number of mismatches, insertions, and
- # deletions. Insertions are nucleotides found in the pattern but not in the
- # sequence. Deletions are nucleotides found in the sequence but not in the
- # pattern. Matches found in block context return the Match object. Otherwise
- # matches are returned in an Array of Match objects.
- def patscan(pattern, offset = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
+ # Method to iterate through a sequence from a given start position to the end of
+ # the sequence or to a given stop position to locate a pattern allowing for a
+ # maximum number of mismatches, insertions, and deletions. Insertions are
+ # nucleotides found in the pattern but not in the sequence. Deletions are
+ # nucleotides found in the sequence but not in the pattern. Matches found in
+ # block context return the Match object. Otherwise matches are returned in an
+ # Array of Match objects.
+ def patscan(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
+ if start.is_a? Array
+ start, stop = *start
+ else
+ stop = self.length - 1
+ end
+
raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
- raise BackTrackError, "offset: #{offset} out of range (0 ... #{self.length})" unless (0 ... self.length).include? offset
+ raise BackTrackError, "start: #{start} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? start
+ raise BackTrackError, "stop: #{stop} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? stop
raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches
raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions
raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions
matches = []
- if block_given?
- while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions)
- yield Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
- offset = result.first + 1
- end
- else
- while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions)
- matches << Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
- offset = result.first + 1
+ while result = scan_C(self.seq, pattern, start, stop, max_mismatches, max_insertions, max_deletions)
+ match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
+
+ if block_given?
+ yield match
+ else
+ matches << match
end
+
+ start = result.first + 1
end
return matches.empty? ? nil : matches unless block_given?
private
inline do |builder|
- builder.add_static "id_seq", 'rb_intern("@seq")', "ID"
+ add_ambiguity_macro(builder)
- # Macro for matching nucleotides including ambiguity codes.
- builder.prefix %{
- #define MATCH(A,B) ((bitmap[(unsigned short) A] & bitmap[(unsigned short) B]) != 0)
- }
-
- # Bitmap for matching nucleotides including ambiguity codes.
- # For each value bits are set from the left: bit pos 1 for A,
- # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
- builder.prefix %{
- unsigned short bitmap[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
- 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
- 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
- 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- };
- }
-
- # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mm
+ # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
# mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
- # reporting the match endpoints.
+ # reporting the match endpoints. State is used to avoid ins followed by del and visa versa which
+ # are nonsense.
builder.prefix %{
- unsigned int backtrack(char* ss, char* s, char* p, int mm, int ins, int del)
+ unsigned int backtrack(
+ char *ss, // Sequence start
+ char *s, // Sequence
+ char *p, // Pattern
+ unsigned int mis, // Max mismatches
+ unsigned int ins, // Max insertions
+ unsigned int del, // Max deletions
+ int state // Last event: mis, ins or del
+ )
{
- unsigned int r = 0;
+ unsigned int r = 0;
- while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
+ while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
- if (!*p)
- return (unsigned int) (s - ss);
- else
- {
- if (mm && *s && *p && (r = backtrack(ss, s + 1, p + 1, mm - 1, ins, del))) return r;
- if (ins && *p && (r = backtrack(ss, s, p + 1, mm, ins - 1, del))) return r;
- if (del && *s && (r = backtrack(ss, s + 1, p, mm, ins, del - 1))) return r;
- }
+ if (!*p)
+ return (unsigned int) (s - ss);
+ else
+ {
+ if (mis && *s && *p && (r = backtrack(ss, s + 1, p + 1, mis - 1, ins, del, 0))) return r;
+ if (ins && *p && (state != -1) && (r = backtrack(ss, s, p + 1, mis, ins - 1, del, 1))) return r;
+ if (del && *s && (state != 1) && (r = backtrack(ss, s + 1, p, mis, ins, del - 1, -1))) return r;
+ }
- return 0;
+ return 0;
}
}
- # Find pattern (p) in a sequence (@seq) starting at pos, with at most mm mismatches, ins
+ # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins
# insertions and del deletions.
builder.c %{
- static VALUE track(char* p, unsigned int pos, int mm, int ins, int del)
+ VALUE scan_C(
+ VALUE _s, // Sequence
+ VALUE _p, // Pattern
+ VALUE _start, // Search postition start
+ VALUE _stop, // Search position stop
+ VALUE _mis, // Maximum mismatches
+ VALUE _ins, // Maximum insertions
+ VALUE _del // Maximum deletions
+ )
{
- VALUE seq = rb_ivar_get(self, id_seq);
- char* s = StringValuePtr(seq);
- char* ss = s;
- unsigned int e;
- VALUE tuple;
-
- if (pos < strlen(s))
+ char *s = StringValuePtr(_s);
+ char *p = StringValuePtr(_p);
+ unsigned int start = FIX2UINT(_start);
+ unsigned int stop = FIX2UINT(_stop);
+ unsigned int mis = FIX2UINT(_mis);
+ unsigned int ins = FIX2UINT(_ins);
+ unsigned int del = FIX2UINT(_del);
+
+ char *ss = s;
+ int state = 0;
+ unsigned int i = 0;
+ unsigned int e = 0;
+ VALUE tuple;
+
+ s += start;
+
+ for (i = start; i <= stop; i++, s++)
+ {
+ if ((e = backtrack(ss, s, p, mis, ins, del, state)))
{
- s += pos;
-
- while (*s)
- {
- if ((e = backtrack(ss, s, p, mm, ins, del)))
- {
- tuple = rb_ary_new();
- rb_ary_push(tuple, INT2FIX((int) (s - ss)));
- rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
- return tuple;
- }
-
- s++;
- }
+ tuple = rb_ary_new();
+ rb_ary_push(tuple, INT2FIX((int) (s - ss)));
+ rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
+ return tuple;
}
+ }
- return Qnil;
- }
+ return Qnil;
+ }
}
end