X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fseq%2Fbacktrack.rb;h=714cb226ef00dbaa642aaf10be7d0d1298a0113c;hb=3d51113401b585dc2fc813cbe67f8250e2bcfd43;hp=94d4d69c533ada11a447fbedf48aa972e16543ef;hpb=9ea3aebac833ba62708099cfd1f913ebe2c41561;p=biopieces.git diff --git a/code_ruby/lib/maasha/seq/backtrack.rb b/code_ruby/lib/maasha/seq/backtrack.rb index 94d4d69..714cb22 100644 --- a/code_ruby/lib/maasha/seq/backtrack.rb +++ b/code_ruby/lib/maasha/seq/backtrack.rb @@ -41,9 +41,9 @@ module BackTrack MAX_DEL = 5 # Maximum number of deletions allowed # ------------------------------------------------------------------------------ - # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) + # str.scan(pattern[, max_mismatches[, max_insertions[, max_deletions]]]) # -> Array - # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) { |match| + # str.scan(pattern[, max_mismatches[, max_insertions[, max_deletions]]]) { |match| # block # } # -> Match @@ -64,16 +64,18 @@ module BackTrack matches = [] - if block_given? - while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions) - yield Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last]) - offset = result.first + 1 - end - else - while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions) - matches << Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last]) - offset = result.first + 1 + while result = track_C(self.seq, self.length, pattern, offset, max_mismatches, max_insertions, max_deletions) + pos, length, mis, ins, del = result + + match = Match.new(pos, length, mis, ins, del, self.seq[pos ... pos + length]) + + if block_given? + yield match + else + matches << match end + + offset = match.pos + 1 end return matches.empty? ? nil : matches unless block_given? @@ -82,18 +84,27 @@ module BackTrack private inline do |builder| - builder.add_static "id_seq", 'rb_intern("@seq")', "ID" - # Macro for matching nucleotides including ambiguity codes. builder.prefix %{ - #define MATCH(A,B) ((bitmap[(unsigned short) A] & bitmap[(unsigned short) B]) != 0) + #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0) + } + + # Defining a struct for returning match information. + builder.prefix %{ + typedef struct + { + unsigned int end; + unsigned int mis; + unsigned int ins; + unsigned int del; + } match; } # Bitmap for matching nucleotides including ambiguity codes. # For each value bits are set from the left: bit pos 1 for A, # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G. builder.prefix %{ - unsigned short bitmap[256] = { + char bitmap[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -113,75 +124,121 @@ module BackTrack }; } - # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mm + # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for # reporting the match endpoints. builder.prefix %{ - unsigned int backtrack(char* ss, char* s, char* p, int mm, int ins, int del) + match* backtrack( + char *ss, // Sequence start + char *s, // Sequence + char *p, // Pattern + unsigned int mis, // Max mismatches + unsigned int ins, // Max insertions + unsigned int del // Max deletions + ) { - unsigned int r = 0; - - while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment - - if (!*p) - return (unsigned int) (s - ss); - else - { - if (mm && *s && *p && (r = backtrack(ss, s + 1, p + 1, mm - 1, ins, del))) return r; - if (ins && *p && (r = backtrack(ss, s, p + 1, mm, ins - 1, del))) return r; - if (del && *s && (r = backtrack(ss, s + 1, p, mm, ins, del - 1))) return r; - } - - return 0; + match found = {0, 0, 0, 0}; + match *f = &found; + + while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment + + if (!*p) + { + f->end = (unsigned int) (s - ss); + f->mis = mis; + f->ins = ins; + f->del = del; + + return f; + } + else + { + if (mis && *s && *p && (f = backtrack(ss, s + 1, p + 1, mis - 1, ins, del))) return f; + if (ins && *p && (f = backtrack(ss, s, p + 1, mis, ins - 1, del))) return f; + if (del && *s && (f = backtrack(ss, s + 1, p, mis, ins, del - 1))) return f; + } + + return NULL; } } - # Find pattern (p) in a sequence (@seq) starting at pos, with at most mm mismatches, ins + # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins # insertions and del deletions. builder.c %{ - static VALUE track(char* p, unsigned int pos, int mm, int ins, int del) + VALUE track_C( + VALUE _s, // Sequence + VALUE _len, // Sequence length + VALUE _p, // Pattern + VALUE _pos, // Start position + VALUE _max_mis, // Maximum mismatches + VALUE _max_ins, // Maximum insertions + VALUE _max_del // Maximum deletions + ) { - VALUE seq = rb_ivar_get(self, id_seq); - char* s = StringValuePtr(seq); - char* ss = s; - unsigned int e; - VALUE tuple; - - if (pos < strlen(s)) + char *s = StringValuePtr(_s); + char *p = StringValuePtr(_p); + unsigned int len = FIX2UINT(_len); + unsigned int pos = FIX2UINT(_pos); + unsigned int max_mis = FIX2UINT(_max_mis); + unsigned int max_ins = FIX2UINT(_max_ins); + unsigned int max_del = FIX2UINT(_max_del); + + char *ss = s; + match *f = NULL; + unsigned int mbeg = 0; + unsigned int mlen = 0; + unsigned int mmis = 0; + unsigned int mins = 0; + unsigned int mdel = 0; + VALUE ary = 0; + + if (pos < len) + { + s += pos; + + while (*s) { - s += pos; - - while (*s) - { - if ((e = backtrack(ss, s, p, mm, ins, del))) - { - tuple = rb_ary_new(); - rb_ary_push(tuple, INT2FIX((int) (s - ss))); - rb_ary_push(tuple, INT2FIX((int) e - (s - ss))); - return tuple; - } - - s++; - } + if ((f = backtrack(ss, s, p, max_mis, max_ins, max_del))) + { + mbeg = (s - ss); + mlen = f->end - mbeg; + mmis = max_mis - f->mis; + mins = max_ins - f->ins; + mdel = max_del - f->del; + + ary = rb_ary_new(); + rb_ary_push(ary, INT2FIX(mbeg)); + rb_ary_push(ary, INT2FIX(mlen)); + rb_ary_push(ary, INT2FIX(mmis)); + rb_ary_push(ary, INT2FIX(mins)); + rb_ary_push(ary, INT2FIX(mdel)); + return ary; + } + + s++; } + } - return Qnil; - } + return Qnil; + } } end # Class containing match information. class Match - attr_reader :pos, :length, :match + attr_reader :pos, :length, :mis, :ins, :del, :match - def initialize(pos, length, match) + def initialize(pos, length, mis, ins, del, match) @pos = pos @length = length + @mis = mis + @ins = ins + @del = del @match = match end def to_s - "#{pos}:#{length}:#{match}" + "#{@pos}:#{@length}:#{@mis}:#{@ins}:#{@del}:#{@match}" end end end