From 32cc7b1e1358cade767a865dd0db99470b4df2ba Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 20 Jun 2013 21:05:55 +0000 Subject: [PATCH] added levenshtein edit distance calculator git-svn-id: http://biopieces.googlecode.com/svn/trunk@2180 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/lib/maasha/levenshtein.rb | 107 ++++++++++++++++++++++ code_ruby/test/maasha/test_levenshtein.rb | 53 +++++++++++ 2 files changed, 160 insertions(+) create mode 100644 code_ruby/lib/maasha/levenshtein.rb create mode 100755 code_ruby/test/maasha/test_levenshtein.rb diff --git a/code_ruby/lib/maasha/levenshtein.rb b/code_ruby/lib/maasha/levenshtein.rb new file mode 100644 index 0000000..1447e17 --- /dev/null +++ b/code_ruby/lib/maasha/levenshtein.rb @@ -0,0 +1,107 @@ +# Copyright (C) 2013 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'inline' + +class Levenshtein + BYTES_IN_INT = 4 + + def self.distance(s, t) + return 0 if s == t; + return t.length if s.length == 0; + return s.length if t.length == 0; + + v0 = "\0" * (t.length + 1) * BYTES_IN_INT + v1 = "\0" * (t.length + 1) * BYTES_IN_INT + + l = self.new + l.distance_C(s, t, s.length, t.length, v0, v1) + end + + def initialize + end + + # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<< + + inline do |builder| + builder.prefix %{ + unsigned int min(unsigned int a, unsigned int b, unsigned int c) + { + unsigned int m = a; + + if (m > b) m = b; + if (m > c) m = c; + + return m; + } + } + + builder.c %{ + VALUE distance_C( + VALUE _s, + VALUE _t, + VALUE _s_len, + VALUE _t_len, + VALUE _v0, + VALUE _v1 + ) + { + char *s = (char *) StringValuePtr(_s); + char *t = (char *) StringValuePtr(_t); + unsigned int s_len = FIX2UINT(_s_len); + unsigned int t_len = FIX2UINT(_t_len); + unsigned int *v0 = (unsigned int *) StringValuePtr(_v0); + unsigned int *v1 = (unsigned int *) StringValuePtr(_v1); + + unsigned int i = 0; + unsigned int j = 0; + unsigned int cost = 0; + + for (i = 0; i < t_len + 1; i++) + v0[i] = i; + + for (i = 0; i < s_len; i++) + { + v1[0] = i + 1; + + for (j = 0; j < t_len; j++) + { + cost = (s[i] == t[j]) ? 0 : 1; + v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost); + } + + for (j = 0; j < t_len + 1; j++) + v0[j] = v1[j]; + } + + return UINT2NUM(v1[t_len]); + } + } + end +end + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/code_ruby/test/maasha/test_levenshtein.rb b/code_ruby/test/maasha/test_levenshtein.rb new file mode 100755 index 0000000..fca9029 --- /dev/null +++ b/code_ruby/test/maasha/test_levenshtein.rb @@ -0,0 +1,53 @@ +#!/usr/bin/env ruby +$:.unshift File.join(File.dirname(__FILE__), '..', '..') + +# Copyright (C) 2013 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'test/unit' +require 'test/helper' +require 'maasha/levenshtein' + +class LevenshteinTest < Test::Unit::TestCase + test "Levenshtein.distance returns 0 with identical strings" do + assert_equal(0, Levenshtein.distance("foo", "foo")) + end + + test "Levenshtein.distance returns s length if t is empty" do + assert_equal(3, Levenshtein.distance("foo", "")) + end + + test "Levenshtein.distance returns t length if s is empty" do + assert_equal(5, Levenshtein.distance("", "fobar")) + end + + test "Levenshtein.distance with mismatch only returns correctly" do + assert_equal(1, Levenshtein.distance("foo", "fox")) + end + + test "Levenshtein.distance returns correctly" do + assert_equal(3, Levenshtein.distance("kitten", "sitting")) + assert_equal(3, Levenshtein.distance("Saturday", "Sunday")) + end +end -- 2.39.2