MAX_DEL = 5 # Maximum number of deletions allowed
# ------------------------------------------------------------------------------
- # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]])
+ # str.scan(pattern[, max_mismatches[, max_insertions[, max_deletions]]])
# -> Array
- # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) { |match|
+ # str.scan(pattern[, max_mismatches[, max_insertions[, max_deletions]]]) { |match|
# block
# }
# -> Match
matches = []
- if block_given?
- while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions)
- yield Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
- offset = result.first + 1
- end
- else
- while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions)
- matches << Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
- offset = result.first + 1
+ while result = track_C(self.seq, self.length, pattern, offset, max_mismatches, max_insertions, max_deletions)
+ pos, length, mis, ins, del = result
+
+ match = Match.new(pos, length, mis, ins, del, self.seq[pos ... pos + length])
+
+ if block_given?
+ yield match
+ else
+ matches << match
end
+
+ offset = match.pos + 1
end
return matches.empty? ? nil : matches unless block_given?
private
inline do |builder|
- builder.add_static "id_seq", 'rb_intern("@seq")', "ID"
-
# Macro for matching nucleotides including ambiguity codes.
builder.prefix %{
- #define MATCH(A,B) ((bitmap[(unsigned short) A] & bitmap[(unsigned short) B]) != 0)
+ #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
+ }
+
+ # Defining a struct for returning match information.
+ builder.prefix %{
+ typedef struct
+ {
+ unsigned int end;
+ unsigned int mis;
+ unsigned int ins;
+ unsigned int del;
+ } match;
}
# Bitmap for matching nucleotides including ambiguity codes.
# For each value bits are set from the left: bit pos 1 for A,
# bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
builder.prefix %{
- unsigned short bitmap[256] = {
+ char bitmap[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
}
- # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mm
+ # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis
# mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for
# reporting the match endpoints.
builder.prefix %{
- unsigned int backtrack(char* ss, char* s, char* p, int mm, int ins, int del)
+ match* backtrack(
+ char *ss, // Sequence start
+ char *s, // Sequence
+ char *p, // Pattern
+ unsigned int mis, // Max mismatches
+ unsigned int ins, // Max insertions
+ unsigned int del // Max deletions
+ )
{
- unsigned int r = 0;
-
- while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
-
- if (!*p)
- return (unsigned int) (s - ss);
- else
- {
- if (mm && *s && *p && (r = backtrack(ss, s + 1, p + 1, mm - 1, ins, del))) return r;
- if (ins && *p && (r = backtrack(ss, s, p + 1, mm, ins - 1, del))) return r;
- if (del && *s && (r = backtrack(ss, s + 1, p, mm, ins, del - 1))) return r;
- }
-
- return 0;
+ match found = {0, 0, 0, 0};
+ match *f = &found;
+
+ while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
+
+ if (!*p)
+ {
+ f->end = (unsigned int) (s - ss);
+ f->mis = mis;
+ f->ins = ins;
+ f->del = del;
+
+ return f;
+ }
+ else
+ {
+ if (mis && *s && *p && (f = backtrack(ss, s + 1, p + 1, mis - 1, ins, del))) return f;
+ if (ins && *p && (f = backtrack(ss, s, p + 1, mis, ins - 1, del))) return f;
+ if (del && *s && (f = backtrack(ss, s + 1, p, mis, ins, del - 1))) return f;
+ }
+
+ return NULL;
}
}
- # Find pattern (p) in a sequence (@seq) starting at pos, with at most mm mismatches, ins
+ # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins
# insertions and del deletions.
builder.c %{
- static VALUE track(char* p, unsigned int pos, int mm, int ins, int del)
+ VALUE track_C(
+ VALUE _s, // Sequence
+ VALUE _len, // Sequence length
+ VALUE _p, // Pattern
+ VALUE _pos, // Start position
+ VALUE _max_mis, // Maximum mismatches
+ VALUE _max_ins, // Maximum insertions
+ VALUE _max_del // Maximum deletions
+ )
{
- VALUE seq = rb_ivar_get(self, id_seq);
- char* s = StringValuePtr(seq);
- char* ss = s;
- unsigned int e;
- VALUE tuple;
-
- if (pos < strlen(s))
+ char *s = StringValuePtr(_s);
+ char *p = StringValuePtr(_p);
+ unsigned int len = FIX2UINT(_len);
+ unsigned int pos = FIX2UINT(_pos);
+ unsigned int max_mis = FIX2UINT(_max_mis);
+ unsigned int max_ins = FIX2UINT(_max_ins);
+ unsigned int max_del = FIX2UINT(_max_del);
+
+ char *ss = s;
+ match *f = NULL;
+ unsigned int mbeg = 0;
+ unsigned int mlen = 0;
+ unsigned int mmis = 0;
+ unsigned int mins = 0;
+ unsigned int mdel = 0;
+ VALUE ary = 0;
+
+ if (pos < len)
+ {
+ s += pos;
+
+ while (*s)
{
- s += pos;
-
- while (*s)
- {
- if ((e = backtrack(ss, s, p, mm, ins, del)))
- {
- tuple = rb_ary_new();
- rb_ary_push(tuple, INT2FIX((int) (s - ss)));
- rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
- return tuple;
- }
-
- s++;
- }
+ if ((f = backtrack(ss, s, p, max_mis, max_ins, max_del)))
+ {
+ mbeg = (s - ss);
+ mlen = f->end - mbeg;
+ mmis = max_mis - f->mis;
+ mins = max_ins - f->ins;
+ mdel = max_del - f->del;
+
+ ary = rb_ary_new();
+ rb_ary_push(ary, INT2FIX(mbeg));
+ rb_ary_push(ary, INT2FIX(mlen));
+ rb_ary_push(ary, INT2FIX(mmis));
+ rb_ary_push(ary, INT2FIX(mins));
+ rb_ary_push(ary, INT2FIX(mdel));
+ return ary;
+ }
+
+ s++;
}
+ }
- return Qnil;
- }
+ return Qnil;
+ }
}
end
# Class containing match information.
class Match
- attr_reader :pos, :length, :match
+ attr_reader :pos, :length, :mis, :ins, :del, :match
- def initialize(pos, length, match)
+ def initialize(pos, length, mis, ins, del, match)
@pos = pos
@length = length
+ @mis = mis
+ @ins = ins
+ @del = del
@match = match
end
def to_s
- "#{pos}:#{length}:#{match}"
+ "#{@pos}:#{@length}:#{@mis}:#{@ins}:#{@del}:#{@match}"
end
end
end