From: martinahansen Date: Fri, 21 Jun 2013 09:09:29 +0000 (+0000) Subject: fixed ambiguity in levenshtein.rb X-Git-Url: https://git.donarmstrong.com/?p=biopieces.git;a=commitdiff_plain;h=9ff3f4f82dedb5ccaa924fe311486bad3ed1248d fixed ambiguity in levenshtein.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@2183 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/levenshtein.rb b/code_ruby/lib/maasha/levenshtein.rb deleted file mode 100644 index bbc2c70..0000000 --- a/code_ruby/lib/maasha/levenshtein.rb +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (C) 2013 Martin A. Hansen. - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -# http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# This software is part of the Biopieces framework (www.biopieces.org). - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -require 'inline' - -# Class to calculate the Levenshtein distance between two -# given strings. 
-# http://en.wikipedia.org/wiki/Levenshtein_distance -class Levenshtein - BYTES_IN_INT = 4 - - def self.distance(s, t) - return 0 if s == t; - return t.length if s.length == 0; - return s.length if t.length == 0; - - v0 = "\0" * (t.length + 1) * BYTES_IN_INT - v1 = "\0" * (t.length + 1) * BYTES_IN_INT - - l = self.new - l.distance_C(s, t, s.length, t.length, v0, v1) - end - - # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<< - - inline do |builder| - builder.prefix %{ - unsigned int min(unsigned int a, unsigned int b, unsigned int c) - { - unsigned int m = a; - - if (m > b) m = b; - if (m > c) m = c; - - return m; - } - } - - builder.c %{ - VALUE distance_C( - VALUE _s, // string - VALUE _t, // string - VALUE _s_len, // string length - VALUE _t_len, // string length - VALUE _v0, // score vector - VALUE _v1 // score vector - ) - { - char *s = (char *) StringValuePtr(_s); - char *t = (char *) StringValuePtr(_t); - unsigned int s_len = FIX2UINT(_s_len); - unsigned int t_len = FIX2UINT(_t_len); - unsigned int *v0 = (unsigned int *) StringValuePtr(_v0); - unsigned int *v1 = (unsigned int *) StringValuePtr(_v1); - - unsigned int i = 0; - unsigned int j = 0; - unsigned int cost = 0; - - for (i = 0; i < t_len + 1; i++) - v0[i] = i; - - for (i = 0; i < s_len; i++) - { - v1[0] = i + 1; - - for (j = 0; j < t_len; j++) - { - cost = (s[i] == t[j]) ? 0 : 1; - v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost); - } - - for (j = 0; j < t_len + 1; j++) - v0[j] = v1[j]; - } - - return UINT2NUM(v1[t_len]); - } - } - end -end - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -__END__ diff --git a/code_ruby/lib/maasha/seq/levenshtein.rb b/code_ruby/lib/maasha/seq/levenshtein.rb new file mode 100644 index 0000000..b9f8243 --- /dev/null +++ b/code_ruby/lib/maasha/seq/levenshtein.rb @@ -0,0 +1,136 @@ +# Copyright (C) 2013 Martin A. Hansen. 
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'inline'
+
+# Class to calculate the Levenshtein distance between two nucleotide
+# sequences. Residues are compared through a bitmap of A/T/C/G bits, so an
+# ambiguity code matches any residue it shares at least one base with.
+# http://en.wikipedia.org/wiki/Levenshtein_distance
+class Levenshtein
+  # Size in bytes of a native C unsigned int; used to size the score vectors.
+  BYTES_IN_INT = [0].pack('I').bytesize
+
+  # Returns the Levenshtein distance between the strings s and t.
+  #
+  # Identical strings yield 0 and an empty string yields the length of the
+  # other one without entering the C code. Otherwise two zero-filled byte
+  # strings are allocated as score vectors and handed to distance_C.
+  #
+  # NOTE(review): the lengths passed on are character counts while the C code
+  # indexes bytes, so single-byte (ASCII) sequences are presumably expected -
+  # confirm against callers.
+  def self.distance(s, t)
+    return 0 if s == t
+    return t.length if s.empty?
+    return s.length if t.empty?
+
+    # Each vector holds t.length + 1 unsigned ints, written by the C code.
+    v0 = "\0" * (t.length + 1) * BYTES_IN_INT
+    v1 = "\0" * (t.length + 1) * BYTES_IN_INT
+
+    new.distance_C(s, t, s.length, t.length, v0, v1)
+  end
+
+  # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
+
+  inline do |builder|
+    # Macro for matching nucleotides including ambiguity codes: two residues
+    # match when their bitmap entries share a bit. The arguments are cast to
+    # unsigned char because plain char may be signed, in which case a byte
+    # >= 0x80 would index the bitmap with a negative offset.
+    builder.prefix %{
+      #define MATCH(A,B) ((bitmap[(unsigned char) (A)] & bitmap[(unsigned char) (B)]) != 0)
+    }
+
+    # Bitmap for matching nucleotides including ambiguity codes, indexed by
+    # byte value; upper and lower case letters carry the same entry.
+    # Bits are counted from the least significant end: value 1 for A,
+    # value 2 for T (U has the same entry), value 4 for C, and value 8 for G.
+    # All other bytes are 0, so they never satisfy MATCH - not even against
+    # themselves.
+    builder.prefix %{
+      char bitmap[256] = {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
+        0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
+        0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
+        0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+      };
+    }
+
+    # Helper returning the smallest of three unsigned ints.
+    builder.prefix %{
+      unsigned int min(unsigned int a, unsigned int b, unsigned int c)
+      {
+        unsigned int m = a;
+
+        if (m > b) m = b;
+        if (m > c) m = c;
+
+        return m;
+      }
+    }
+
+    # Computes the distance with two rolling rows of the score matrix: v0 is
+    # the previous row and v1 the row being filled for the current s[i].
+    builder.c %{
+      VALUE distance_C(
+        VALUE _s,     // string
+        VALUE _t,     // string
+        VALUE _s_len, // string length
+        VALUE _t_len, // string length
+        VALUE _v0,    // score vector
+        VALUE _v1     // score vector
+      )
+      {
+        char         *s     = (char *) StringValuePtr(_s);
+        char         *t     = (char *) StringValuePtr(_t);
+        unsigned int  s_len = FIX2UINT(_s_len);
+        unsigned int  t_len = FIX2UINT(_t_len);
+        unsigned int *v0    = (unsigned int *) StringValuePtr(_v0);
+        unsigned int *v1    = (unsigned int *) StringValuePtr(_v1);
+
+        unsigned int i    = 0;
+        unsigned int j    = 0;
+        unsigned int cost = 0;
+
+        // Row for the empty prefix of s: v0[i] = i.
+        for (i = 0; i < t_len + 1; i++)
+          v0[i] = i;
+
+        for (i = 0; i < s_len; i++)
+        {
+          // First column: distance from s[0..i] to the empty prefix of t.
+          v1[0] = i + 1;
+
+          for (j = 0; j < t_len; j++)
+          {
+            cost = (MATCH(s[i], t[j])) ? 0 : 1;
+            v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost);
+          }
+
+          // The finished row becomes the previous row of the next round.
+          for (j = 0; j < t_len + 1; j++)
+            v0[j] = v1[j];
+        }
+
+        return UINT2NUM(v1[t_len]);
+      }
+    }
+  end
+end
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__