From: martinahansen Date: Wed, 5 Dec 2012 10:20:05 +0000 (+0000) Subject: renamed patternmatcher to dynamic X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=ad496507a37910273c672e4a4155545bf00d58dd;p=biopieces.git renamed patternmatcher to dynamic git-svn-id: http://biopieces.googlecode.com/svn/trunk@2031 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/seq/backtrack.rb b/code_ruby/lib/maasha/seq/backtrack.rb index 1d09c30..ab2c00f 100644 --- a/code_ruby/lib/maasha/seq/backtrack.rb +++ b/code_ruby/lib/maasha/seq/backtrack.rb @@ -40,6 +40,20 @@ module BackTrack MAX_INS = 5 # Maximum number of insertions allowed MAX_DEL = 5 # Maximum number of deletions allowed + # ------------------------------------------------------------------------------ + # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) + # -> Match + # str.patmatch(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match| + # block + # } + # -> Match + # + # ------------------------------------------------------------------------------ + # Method to iterate through a sequence from a given start position to the end of + # the sequence or to a given stop position to locate a pattern allowing for a + # maximum number of mismatches, insertions, and deletions. Insertions are + # nucleotides found in the pattern but not in the sequence. Deletions are + # nucleotides found in the sequence but not in the pattern. def patmatch(pattern, start = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0) self.patscan(pattern, start, max_mismatches, max_insertions, max_deletions) do |m| if block_given? @@ -59,9 +73,9 @@ module BackTrack end # ------------------------------------------------------------------------------ - # str.scan(pattern[, start|[start, stop][, max_mismatches[, max_insertions[, max_deletions]]]]) + # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) # -> Array - # str.scan(pattern[, start|[start, stop][, max_mismatches[, max_insertions[, max_deletions]]]]) { |match| + # str.patscan(pattern[, start|[start, stop][, max_mis[, max_ins[, max_del]]]]) { |match| # block # } # -> Match @@ -82,11 +96,11 @@ module BackTrack end raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN -# raise BackTrackError, "start: #{start} out of range (0 .. #{self.length - 1})" unless (0 .. self.length).include? start -# raise BackTrackError, "stop: #{stop} out of range (0 .. #{self.length - 1})" unless (0 .. self.length).include? stop -# raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches -# raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions -# raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions + raise BackTrackError, "start: #{start} out of range (0 .. #{self.length - 1})" unless (0 .. self.length).include? start + raise BackTrackError, "stop: #{stop} out of range (0 .. #{self.length - 1})" unless (0 .. self.length).include? stop + raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches + raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions + raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions matches = [] diff --git a/code_ruby/lib/maasha/seq/dynamic.rb b/code_ruby/lib/maasha/seq/dynamic.rb new file mode 100644 index 0000000..6bb4294 --- /dev/null +++ b/code_ruby/lib/maasha/seq/dynamic.rb @@ -0,0 +1,282 @@ +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'inline' + +# Module containing code to locate nucleotide patterns in sequences allowing for +# ambiguity codes and a given maximum edit distance. +# Insertions are nucleotides found in the pattern but not in the sequence. +# Deletions are nucleotides found in the sequence but not in the pattern. +# +# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197): +# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf +module PatternMatcher + # ------------------------------------------------------------------------------ + # str.patmatch(pattern[, pos[, max_edit_distance]]) + # -> Match or nil + # str.patscan(pattern[, pos[, max_edit_distance]]) { |match| + # block + # } + # -> Match + # + # ------------------------------------------------------------------------------ + # Method to iterate through a sequence to locate the first pattern match + # starting from a given position and allowing for a maximum edit distance. + def patmatch(pattern, pos = 0, max_edit_distance = 0) + self.patscan(pattern, pos, max_edit_distance) do |m| + return m + end + end + + # ------------------------------------------------------------------------------ + # str.patscan(pattern[, pos[, max_edit_distance]]) + # -> Array or nil + # str.patscan(pattern[, pos[, max_edit_distance]]) { |match| + # block + # } + # -> Match + # + # ------------------------------------------------------------------------------ + # Method to iterate through a sequence to locate pattern matches starting from a + # given position and allowing for a maximum edit distance. Matches found in + # block context return the Match object. Otherwise matches are returned in an + # Array. + def patscan(pattern, pos = 0, max_edit_distance = 0) + matches = [] + + while result = match_C(self.seq, self.length, pattern, pattern.length, pos, max_edit_distance) + match = Match.new(*result, self.seq[result[0] ... result[0] + result[1]]); + + if block_given? + yield match + else + matches << match + end + + pos = match.beg + 1 + end + + return matches unless block_given? + end + + private + + inline do |builder| + # Macro for matching nucleotides including ambiguity codes. + builder.prefix %{ + #define MAX_PAT 1024 + } + + builder.prefix %{ + #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0) + } + + builder.prefix %{ + typedef struct + { + unsigned int mis; + unsigned int ins; + unsigned int del; + unsigned int ed; + } score; + } + + # Bitmap for matching nucleotides including ambiguity codes. + # For each value bits are set from the left: bit pos 1 for A, + # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G. + builder.prefix %{ + char bitmap[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0, + 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0, + 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0, + 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + } + + builder.prefix %{ + void vector_init(score *vec, unsigned int vec_len) + { + unsigned int i = 0; + + for (i = 1; i < vec_len; i++) + { + vec[i].ins = i; + vec[i].ed = i; + } + } + } + + builder.prefix %{ + void vector_print(score *vec, unsigned int vec_len) + { + unsigned int i = 0; + + for (i = 0; i < vec_len; i++) + { + printf("i: %d mis: %d ins: %d del: %d ed: %d\\n", i, vec[i].mis, vec[i].ins, vec[i].del, vec[i].ed); + } + + printf("---\\n"); + } + } + + builder.prefix %{ + int match_found(score *vec, unsigned int pat_len, unsigned int max_ed) + { + return (vec[pat_len].ed <= max_ed); + } + } + + builder.prefix %{ + void vector_update(score *vec, char *seq, char *pat, unsigned int pat_len, unsigned int pos) + { + score diag = vec[0]; + score up = {0, 0, 0, 0}; // insertion + score left = vec[1]; // deletion + score new = {0, 0, 0, 0}; + + unsigned int i = 0; + + for (i = 0; i < pat_len; i++) + { + if (MATCH(seq[pos], pat[i])) // match + { + new = diag; + } + else + { + if (left.ed <= diag.ed && left.ed <= up.ed) // deletion + { + new = left; + new.del++; + } + else if (diag.ed <= up.ed && diag.ed <= left.ed) // mismatch + { + new = diag; + new.mis++; + } + else if (up.ed <= diag.ed && up.ed <= left.ed) // insertion + { + new = up; + new.ins++; + } + else + { + printf("This should not happen\\n"); + exit(1); + } + + new.ed++; + } + + diag = vec[i + 1]; + up = new; + left = vec[i + 2]; + + vec[i + 1] = new; + } + } + } + + builder.c %{ + VALUE match_C( + VALUE _seq, // Sequence + VALUE _seq_len, // Sequence length + VALUE _pat, // Pattern + VALUE _pat_len, // Pattern length + VALUE _pos, // Offset position + VALUE _max_ed // Maximum edit distance + ) + { + char *seq = (char *) StringValuePtr(_seq); + char *pat = (char *) StringValuePtr(_pat); + unsigned int seq_len = FIX2UINT(_seq_len); + unsigned int pat_len = FIX2UINT(_pat_len); + unsigned int pos = FIX2UINT(_pos); + unsigned int max_ed = FIX2UINT(_max_ed); + + score vec[MAX_PAT] = {0}; + unsigned int vec_len = pat_len + 1; + unsigned int match_beg = 0; + unsigned int match_len = 0; + + VALUE match_ary; + + vector_init(vec, vec_len); + + while (pos < seq_len) + { + vector_update(vec, seq, pat, pat_len, pos); + + if (match_found(vec, pat_len, max_ed)) + { + match_len = pat_len - vec[pat_len].ins + vec[pat_len].del; + match_beg = pos - match_len + 1; + + match_ary = rb_ary_new(); + rb_ary_push(match_ary, INT2FIX(match_beg)); + rb_ary_push(match_ary, INT2FIX(match_len)); + rb_ary_push(match_ary, INT2FIX(vec[pat_len].mis)); + rb_ary_push(match_ary, INT2FIX(vec[pat_len].ins)); + rb_ary_push(match_ary, INT2FIX(vec[pat_len].del)); + + return match_ary; + } + + pos++; + } + + return Qfalse; // no match + } + } + end + + class Match + attr_accessor :beg, :length, :mis, :ins, :del, :match + + def initialize(beg, length, mis, ins, del, match) + @beg = beg + @length = length + @mis = mis + @ins = ins + @del = del + @match = match + end + end +end + + +__END__ diff --git a/code_ruby/lib/maasha/seq/patternmatcher.rb b/code_ruby/lib/maasha/seq/patternmatcher.rb deleted file mode 100644 index 6bb4294..0000000 --- a/code_ruby/lib/maasha/seq/patternmatcher.rb +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright (C) 2007-2011 Martin A. Hansen. - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -# http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# This software is part of the Biopieces framework (www.biopieces.org). - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -require 'inline' - -# Module containing code to locate nucleotide patterns in sequences allowing for -# ambiguity codes and a given maximum edit distance. -# Insertions are nucleotides found in the pattern but not in the sequence. -# Deletions are nucleotides found in the sequence but not in the pattern. -# -# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197): -# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf -module PatternMatcher - # ------------------------------------------------------------------------------ - # str.patmatch(pattern[, pos[, max_edit_distance]]) - # -> Match or nil - # str.patscan(pattern[, pos[, max_edit_distance]]) { |match| - # block - # } - # -> Match - # - # ------------------------------------------------------------------------------ - # Method to iterate through a sequence to locate the first pattern match - # starting from a given position and allowing for a maximum edit distance. - def patmatch(pattern, pos = 0, max_edit_distance = 0) - self.patscan(pattern, pos, max_edit_distance) do |m| - return m - end - end - - # ------------------------------------------------------------------------------ - # str.patscan(pattern[, pos[, max_edit_distance]]) - # -> Array or nil - # str.patscan(pattern[, pos[, max_edit_distance]]) { |match| - # block - # } - # -> Match - # - # ------------------------------------------------------------------------------ - # Method to iterate through a sequence to locate pattern matches starting from a - # given position and allowing for a maximum edit distance. Matches found in - # block context return the Match object. Otherwise matches are returned in an - # Array. - def patscan(pattern, pos = 0, max_edit_distance = 0) - matches = [] - - while result = match_C(self.seq, self.length, pattern, pattern.length, pos, max_edit_distance) - match = Match.new(*result, self.seq[result[0] ... result[0] + result[1]]); - - if block_given? - yield match - else - matches << match - end - - pos = match.beg + 1 - end - - return matches unless block_given? - end - - private - - inline do |builder| - # Macro for matching nucleotides including ambiguity codes. - builder.prefix %{ - #define MAX_PAT 1024 - } - - builder.prefix %{ - #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0) - } - - builder.prefix %{ - typedef struct - { - unsigned int mis; - unsigned int ins; - unsigned int del; - unsigned int ed; - } score; - } - - # Bitmap for matching nucleotides including ambiguity codes. - # For each value bits are set from the left: bit pos 1 for A, - # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G. - builder.prefix %{ - char bitmap[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0, - 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0, - 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0, - 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }; - } - - builder.prefix %{ - void vector_init(score *vec, unsigned int vec_len) - { - unsigned int i = 0; - - for (i = 1; i < vec_len; i++) - { - vec[i].ins = i; - vec[i].ed = i; - } - } - } - - builder.prefix %{ - void vector_print(score *vec, unsigned int vec_len) - { - unsigned int i = 0; - - for (i = 0; i < vec_len; i++) - { - printf("i: %d mis: %d ins: %d del: %d ed: %d\\n", i, vec[i].mis, vec[i].ins, vec[i].del, vec[i].ed); - } - - printf("---\\n"); - } - } - - builder.prefix %{ - int match_found(score *vec, unsigned int pat_len, unsigned int max_ed) - { - return (vec[pat_len].ed <= max_ed); - } - } - - builder.prefix %{ - void vector_update(score *vec, char *seq, char *pat, unsigned int pat_len, unsigned int pos) - { - score diag = vec[0]; - score up = {0, 0, 0, 0}; // insertion - score left = vec[1]; // deletion - score new = {0, 0, 0, 0}; - - unsigned int i = 0; - - for (i = 0; i < pat_len; i++) - { - if (MATCH(seq[pos], pat[i])) // match - { - new = diag; - } - else - { - if (left.ed <= diag.ed && left.ed <= up.ed) // deletion - { - new = left; - new.del++; - } - else if (diag.ed <= up.ed && diag.ed <= left.ed) // mismatch - { - new = diag; - new.mis++; - } - else if (up.ed <= diag.ed && up.ed <= left.ed) // insertion - { - new = up; - new.ins++; - } - else - { - printf("This should not happen\\n"); - exit(1); - } - - new.ed++; - } - - diag = vec[i + 1]; - up = new; - left = vec[i + 2]; - - vec[i + 1] = new; - } - } - } - - builder.c %{ - VALUE match_C( - VALUE _seq, // Sequence - VALUE _seq_len, // Sequence length - VALUE _pat, // Pattern - VALUE _pat_len, // Pattern length - VALUE _pos, // Offset position - VALUE _max_ed // Maximum edit distance - ) - { - char *seq = (char *) StringValuePtr(_seq); - char *pat = (char *) StringValuePtr(_pat); - unsigned int seq_len = FIX2UINT(_seq_len); - unsigned int pat_len = FIX2UINT(_pat_len); - unsigned int pos = FIX2UINT(_pos); - unsigned int max_ed = FIX2UINT(_max_ed); - - score vec[MAX_PAT] = {0}; - unsigned int vec_len = pat_len + 1; - unsigned int match_beg = 0; - unsigned int match_len = 0; - - VALUE match_ary; - - vector_init(vec, vec_len); - - while (pos < seq_len) - { - vector_update(vec, seq, pat, pat_len, pos); - - if (match_found(vec, pat_len, max_ed)) - { - match_len = pat_len - vec[pat_len].ins + vec[pat_len].del; - match_beg = pos - match_len + 1; - - match_ary = rb_ary_new(); - rb_ary_push(match_ary, INT2FIX(match_beg)); - rb_ary_push(match_ary, INT2FIX(match_len)); - rb_ary_push(match_ary, INT2FIX(vec[pat_len].mis)); - rb_ary_push(match_ary, INT2FIX(vec[pat_len].ins)); - rb_ary_push(match_ary, INT2FIX(vec[pat_len].del)); - - return match_ary; - } - - pos++; - } - - return Qfalse; // no match - } - } - end - - class Match - attr_accessor :beg, :length, :mis, :ins, :del, :match - - def initialize(beg, length, mis, ins, del, match) - @beg = beg - @length = length - @mis = mis - @ins = ins - @del = del - @match = match - end - end -end - - -__END__