From: martinahansen Date: Wed, 18 Sep 2013 18:17:08 +0000 (+0000) Subject: implemented hamming_dist in inline C X-Git-Url: https://git.donarmstrong.com/?p=biopieces.git;a=commitdiff_plain;h=92dba07b3dd9837ed90212126998a8a1f9e00652 implemented hamming_dist in inline C git-svn-id: http://biopieces.googlecode.com/svn/trunk@2203 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index 73bf055..1d8f3c0 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -30,6 +30,7 @@ require 'narray' autoload :BackTrack, 'maasha/seq/backtrack' autoload :Dynamic, 'maasha/seq/dynamic' autoload :Homopolymer, 'maasha/seq/homopolymer' +autoload :Hamming, 'maasha/seq/hamming' autoload :Levenshtein, 'maasha/seq/levenshtein' autoload :Ambiguity, 'maasha/seq/ambiguity' @@ -387,7 +388,7 @@ class Seq # two Sequence objects (case insensitive). def hamming_distance(entry, options = nil) if options and options[:ambiguity] - Ambiguity.hamming_distance(self.seq, entry.seq) + Hamming.distance(self.seq, entry.seq) else self.seq.upcase.hamming_distance(entry.seq.upcase) end diff --git a/code_ruby/lib/maasha/seq/ambiguity.rb b/code_ruby/lib/maasha/seq/ambiguity.rb index 7c21aed..4de50ad 100644 --- a/code_ruby/lib/maasha/seq/ambiguity.rb +++ b/code_ruby/lib/maasha/seq/ambiguity.rb @@ -23,41 +23,12 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'inline' -#autoload :NArray, 'narray' module Ambiguity - # IUPAC alphabet and binary encoding of the same - # http://en.wikipedia.org/wiki/Nucleic_acid_notation - - AMBIGUITY_STR = "ACGTUWSMKRYBDHVNacgtuwsmkrybdhvn" - AMBIGUITY_BIN = "\x08\x04\x02\x01\x01\x09\x06\x0c\x03\x0a\x05\x07\x0b\x0d\x0e\x0f\x08\x04\x02\x01\x01\x09\x06\x0c\x03\x0a\x05\x07\x0b\x0d\x0e\x0f" - - # Class method to convert a sequence string to a bit string - # where the bit positions in each char corresponds to the following: - # A = 1000 - # C = 0100 - # G = 0010 - # T = 0001 - # And ambiguity codes are expressed using similar bit fields. - def self.to_bin(seq) - seq.tr(AMBIGUITY_STR, AMBIGUITY_BIN) - end - - # Class method to convert a bit string to a NArray. - def self.to_na(seq) - NArray.to_na(self.to_bin(seq), 'byte') - end - - # Class method to calculate the Hamming Distance between - # two bit fields encoding in NArrays. - def self.hamming_distance(seq1, seq2) - (self.to_na(seq1) & self.to_na(seq2)).eq(0).sum - end - def add_ambiguity_macro inline_builder # Macro for matching nucleotides including ambiguity codes. inline_builder.prefix %{ - #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0) + #define MATCH(A,B) ((bitmap[(int) A] & bitmap[(int) B]) != 0) } # Bitmap for matching nucleotides including ambiguity codes. diff --git a/code_ruby/lib/maasha/seq/hamming.rb b/code_ruby/lib/maasha/seq/hamming.rb new file mode 100644 index 0000000..6616f73 --- /dev/null +++ b/code_ruby/lib/maasha/seq/hamming.rb @@ -0,0 +1,80 @@ +# Copyright (C) 2013 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'inline' +require 'maasha/seq/ambiguity' + +# Class to calculate the Hamming distance between two +# given strings. +# http://en.wikipedia.org/wiki/Hamming_distance +class Hamming + extend Ambiguity + + # Class method for calculating the Hamming distance between + # two given strings allowing for IUPAC ambiguity codes. + def self.distance(str1, str2) + raise "string length mismatch: #{str1.length} != #{str2.length}" if str1.length != str2.length + + hd = self.new + hd.hamming_distance_C(str1, str2, str1.length) + end + + # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<< + + inline do |builder| + add_ambiguity_macro(builder) + + # C method for calculating Hamming Distance. + builder.c %{ + VALUE hamming_distance_C( + VALUE _str1, // String 1 + VALUE _str2, // String 2 + VALUE _len // String length + ) + { + char *str1 = StringValuePtr(_str1); + char *str2 = StringValuePtr(_str2); + unsigned int len = FIX2UINT(_len); + + unsigned int hamming_dist = 0; + unsigned int i = 0; + + for (i = 0; i < len; i++) + { + if (! MATCH(str1[i], str2[i])) + { + hamming_dist++; + } + } + + return UINT2NUM(hamming_dist); + } + } + end +end + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/code_ruby/lib/maasha/seq/levenshtein.rb b/code_ruby/lib/maasha/seq/levenshtein.rb index 2e34de0..5273d96 100644 --- a/code_ruby/lib/maasha/seq/levenshtein.rb +++ b/code_ruby/lib/maasha/seq/levenshtein.rb @@ -42,7 +42,7 @@ class Levenshtein v1 = "\0" * (t.length + 1) * BYTES_IN_INT l = self.new - l.distance_C(s, t, s.length, t.length, v0, v1) + l.levenshtein_distance_C(s, t, s.length, t.length, v0, v1) end # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<< @@ -63,7 +63,7 @@ class Levenshtein } builder.c %{ - VALUE distance_C( + VALUE levenshtein_distance_C( VALUE _s, // string VALUE _t, // string VALUE _s_len, // string length diff --git a/code_ruby/test/maasha/seq/test_hamming.rb b/code_ruby/test/maasha/seq/test_hamming.rb new file mode 100755 index 0000000..8523fe9 --- /dev/null +++ b/code_ruby/test/maasha/seq/test_hamming.rb @@ -0,0 +1,44 @@ +#!/usr/bin/env ruby +$:.unshift File.join(File.dirname(__FILE__), '..', '..') + +# Copyright (C) 2013 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'test/unit' +require 'test/helper' +require 'maasha/seq/hamming' + +class HammingTest < Test::Unit::TestCase + test "Hamming.distance returns 0 with identical strings" do + assert_equal(0, Hamming.distance("atcg", "atcg")) + end + + test "Hamming.distance with mismatch only returns correctly" do + assert_equal(1, Hamming.distance("atcg", "atgg")) + end + + test "Hamming.distance with ambiguity code returns correctly" do + assert_equal(0, Hamming.distance("atcg", "nnnn")) + end +end