X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fseq%2Fbacktrack.rb;h=3e9a781d7bcad706877ccf77cb6b108d4686c47f;hb=8b6b8cb49facd3cb2f4c6f0d98e3bb84de9a6ddf;hp=94d4d69c533ada11a447fbedf48aa972e16543ef;hpb=9ea3aebac833ba62708099cfd1f913ebe2c41561;p=biopieces.git diff --git a/code_ruby/lib/maasha/seq/backtrack.rb b/code_ruby/lib/maasha/seq/backtrack.rb index 94d4d69..3e9a781 100644 --- a/code_ruby/lib/maasha/seq/backtrack.rb +++ b/code_ruby/lib/maasha/seq/backtrack.rb @@ -23,6 +23,7 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'inline' # RubyInline +require 'maasha/seq/ambiguity' # Error class for all exceptions to do with BackTrack. class BackTrackError < StandardError; end @@ -35,138 +36,198 @@ class BackTrackError < StandardError; end # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow: # http://stackoverflow.com/questions/7557017/approximate-string-matching-using-backtracking/ module BackTrack + extend Ambiguity + OK_PATTERN = Regexp.new('^[bflsycwphqrimtnkvadegu]+$') MAX_MIS = 5 # Maximum number of mismatches allowed MAX_INS = 5 # Maximum number of insertions allowed MAX_DEL = 5 # Maximum number of deletions allowed # ------------------------------------------------------------------------------ - # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) + # str.patmatch(pattern[, options]) + # -> Match + # str.patmatch(pattern[, options]) { |match| + # block + # } + # -> Match + # + # options: + # :start + # :stop + # :max_mismatches + # :max_insertions + # :max_deletions + # + # ------------------------------------------------------------------------------ + # Method to iterate through a sequence from a given start position to the end of + # the sequence or to a given stop position to locate a pattern allowing for a + # maximum number of mismatches, insertions, and deletions. Insertions are + # nucleotides found in the pattern but not in the sequence. Deletions are + # nucleotides found in the sequence but not in the pattern. + def patmatch(pattern, options = {}) + options[:start] ||= 0 + options[:stop] ||= self.length - 1 + options[:max_mismatches] ||= 0 + options[:max_insertions] ||= 0 + options[:max_deletions] ||= 0 + + self.patscan(pattern, options) do |m| + if block_given? + yield m + else + return m + end + + break + end + + if block_given? + yield nil + else + return nil + end + end + + # ------------------------------------------------------------------------------ + # str.patscan(pattern[, options]) # -> Array - # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) { |match| + # str.patscan(pattern[, options]) { |match| # block # } # -> Match # + # options: + # :start + # :stop + # :max_mismatches + # :max_insertions + # :max_deletions + # # ------------------------------------------------------------------------------ - # Method to iterate through a sequence to locate pattern matches starting at a - # given offset allowing for a maximum number of mismatches, insertions, and - # deletions. Insertions are nucleotides found in the pattern but not in the - # sequence. Deletions are nucleotides found in the sequence but not in the - # pattern. Matches found in block context return the Match object. Otherwise - # matches are returned in an Array of Match objects. - def patscan(pattern, offset = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0) + # Method to iterate through a sequence from a given start position to the end of + # the sequence or to a given stop position to locate a pattern allowing for a + # maximum number of mismatches, insertions, and deletions. Insertions are + # nucleotides found in the pattern but not in the sequence. Deletions are + # nucleotides found in the sequence but not in the pattern. Matches found in + # block context return the Match object. Otherwise matches are returned in an + # Array of Match objects. + def patscan(pattern, options = {}) + options[:start] ||= 0 + options[:stop] ||= self.length - 1 + options[:max_mismatches] ||= 0 + options[:max_insertions] ||= 0 + options[:max_deletions] ||= 0 + raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN - raise BackTrackError, "offset: #{offset} out of range (0 ... #{self.length})" unless (0 ... self.length).include? offset - raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches - raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions - raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions + raise BackTrackError, "start: #{options[:start]} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? options[:start] + raise BackTrackError, "stop: #{options[:stop]} out of range (0 .. #{self.length - 1})" unless (0 ... self.length).include? options[:stop] + raise BackTrackError, "max_mismatches: #{options[:max_mismatches]} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? options[:max_mismatches] + raise BackTrackError, "max_insertions: #{options[:max_insertions]} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? options[:max_insertions] + raise BackTrackError, "max_deletions: #{options[:max_deletions]} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? options[:max_deletions] matches = [] - if block_given? - while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions) - yield Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last]) - offset = result.first + 1 - end - else - while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions) - matches << Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last]) - offset = result.first + 1 + while result = scan_C(self.seq, + pattern, + options[:start], + options[:stop], + options[:max_mismatches], + options[:max_insertions], + options[:max_deletions] + ) + match = Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last]) + + if block_given? + yield match + else + matches << match end + + options[:start] = result.first + 1 end - return matches.empty? ? nil : matches unless block_given? + return matches unless block_given? end private inline do |builder| - builder.add_static "id_seq", 'rb_intern("@seq")', "ID" + add_ambiguity_macro(builder) - # Macro for matching nucleotides including ambiguity codes. - builder.prefix %{ - #define MATCH(A,B) ((bitmap[(unsigned short) A] & bitmap[(unsigned short) B]) != 0) - } - - # Bitmap for matching nucleotides including ambiguity codes. - # For each value bits are set from the left: bit pos 1 for A, - # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G. - builder.prefix %{ - unsigned short bitmap[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0, - 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0, - 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0, - 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }; - } - - # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mm + # Backtrack algorithm for matching a pattern (p) starting in a sequence (s) allowing for mis # mismatches, ins insertions and del deletions. ss is the start of the sequence, used only for - # reporting the match endpoints. + # reporting the match endpoints. State is used to avoid ins followed by del and visa versa which + # are nonsense. builder.prefix %{ - unsigned int backtrack(char* ss, char* s, char* p, int mm, int ins, int del) + unsigned int backtrack( + char *ss, // Sequence start + char *s, // Sequence + char *p, // Pattern + unsigned int mis, // Max mismatches + unsigned int ins, // Max insertions + unsigned int del, // Max deletions + int state // Last event: mis, ins or del + ) { - unsigned int r = 0; + unsigned int r = 0; - while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment + while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment - if (!*p) - return (unsigned int) (s - ss); - else - { - if (mm && *s && *p && (r = backtrack(ss, s + 1, p + 1, mm - 1, ins, del))) return r; - if (ins && *p && (r = backtrack(ss, s, p + 1, mm, ins - 1, del))) return r; - if (del && *s && (r = backtrack(ss, s + 1, p, mm, ins, del - 1))) return r; - } + if (!*p) + return (unsigned int) (s - ss); + else + { + if (mis && *s && *p && (r = backtrack(ss, s + 1, p + 1, mis - 1, ins, del, 0))) return r; + if (ins && *p && (state != -1) && (r = backtrack(ss, s, p + 1, mis, ins - 1, del, 1))) return r; + if (del && *s && (state != 1) && (r = backtrack(ss, s + 1, p, mis, ins, del - 1, -1))) return r; + } - return 0; + return 0; } } - # Find pattern (p) in a sequence (@seq) starting at pos, with at most mm mismatches, ins + # Find pattern (p) in a sequence (s) starting at pos, with at most mis mismatches, ins # insertions and del deletions. builder.c %{ - static VALUE track(char* p, unsigned int pos, int mm, int ins, int del) + VALUE scan_C( + VALUE _s, // Sequence + VALUE _p, // Pattern + VALUE _start, // Search postition start + VALUE _stop, // Search position stop + VALUE _mis, // Maximum mismatches + VALUE _ins, // Maximum insertions + VALUE _del // Maximum deletions + ) { - VALUE seq = rb_ivar_get(self, id_seq); - char* s = StringValuePtr(seq); - char* ss = s; - unsigned int e; - VALUE tuple; - - if (pos < strlen(s)) + char *s = StringValuePtr(_s); + char *p = StringValuePtr(_p); + unsigned int start = FIX2UINT(_start); + unsigned int stop = FIX2UINT(_stop); + unsigned int mis = FIX2UINT(_mis); + unsigned int ins = FIX2UINT(_ins); + unsigned int del = FIX2UINT(_del); + + char *ss = s; + int state = 0; + unsigned int i = 0; + unsigned int e = 0; + VALUE tuple; + + s += start; + + for (i = start; i <= stop; i++, s++) + { + if ((e = backtrack(ss, s, p, mis, ins, del, state))) { - s += pos; - - while (*s) - { - if ((e = backtrack(ss, s, p, mm, ins, del))) - { - tuple = rb_ary_new(); - rb_ary_push(tuple, INT2FIX((int) (s - ss))); - rb_ary_push(tuple, INT2FIX((int) e - (s - ss))); - return tuple; - } - - s++; - } + tuple = rb_ary_new(); + rb_ary_push(tuple, INT2FIX((int) (s - ss))); + rb_ary_push(tuple, INT2FIX((int) e - (s - ss))); + return tuple; } + } - return Qnil; - } + return Qnil; + } } end