autoload :BackTrack, 'maasha/seq/backtrack'
autoload :Dynamic, 'maasha/seq/dynamic'
autoload :Homopolymer, 'maasha/seq/homopolymer'
+autoload :Hamming, 'maasha/seq/hamming'
autoload :Levenshtein, 'maasha/seq/levenshtein'
autoload :Ambiguity, 'maasha/seq/ambiguity'
# two Sequence objects (case insensitive).
def hamming_distance(entry, options = nil)
if options and options[:ambiguity]
- Ambiguity.hamming_distance(self.seq, entry.seq)
+ Hamming.distance(self.seq, entry.seq)
else
self.seq.upcase.hamming_distance(entry.seq.upcase)
end
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
require 'inline'
-#autoload :NArray, 'narray'
module Ambiguity
- # IUPAC alphabet and binary encoding of the same
- # http://en.wikipedia.org/wiki/Nucleic_acid_notation
-
- AMBIGUITY_STR = "ACGTUWSMKRYBDHVNacgtuwsmkrybdhvn"
- AMBIGUITY_BIN = "\x08\x04\x02\x01\x01\x09\x06\x0c\x03\x0a\x05\x07\x0b\x0d\x0e\x0f\x08\x04\x02\x01\x01\x09\x06\x0c\x03\x0a\x05\x07\x0b\x0d\x0e\x0f"
-
- # Class method to convert a sequence string to a bit string
- # where the bit positions in each char corresponds to the following:
- # A = 1000
- # C = 0100
- # G = 0010
- # T = 0001
- # And ambiguity codes are expressed using similar bit fields.
- def self.to_bin(seq)
- seq.tr(AMBIGUITY_STR, AMBIGUITY_BIN)
- end
-
- # Class method to convert a bit string to a NArray.
- def self.to_na(seq)
- NArray.to_na(self.to_bin(seq), 'byte')
- end
-
- # Class method to calculate the Hamming Distance between
- # two bit fields encoding in NArrays.
- def self.hamming_distance(seq1, seq2)
- (self.to_na(seq1) & self.to_na(seq2)).eq(0).sum
- end
-
def add_ambiguity_macro inline_builder
# Macro for matching nucleotides including ambiguity codes.
inline_builder.prefix %{
- #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
+ #define MATCH(A,B) ((bitmap[(int) A] & bitmap[(int) B]) != 0)
}
# Bitmap for matching nucleotides including ambiguity codes.
--- /dev/null
+# Copyright (C) 2013 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'inline'
+require 'maasha/seq/ambiguity'
+
+# Class to calculate the Hamming distance between two
+# given strings.
+# http://en.wikipedia.org/wiki/Hamming_distance
+class Hamming
+ extend Ambiguity
+
+ # Class method for calculating the Hamming distance between
+ # two given strings allowing for IUPAC ambiguity codes.
+ def self.distance(str1, str2)
+ raise "string length mismatch: #{str1.length} != #{str2.length}" if str1.length != str2.length
+
+ hd = self.new
+ hd.hamming_distance_C(str1, str2, str1.length)
+ end
+
+ # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
+
+ inline do |builder|
+ add_ambiguity_macro(builder)
+
+ # C method for calculating Hamming Distance.
+ builder.c %{
+ VALUE hamming_distance_C(
+ VALUE _str1, // String 1
+ VALUE _str2, // String 2
+ VALUE _len // String length
+ )
+ {
+ char *str1 = StringValuePtr(_str1);
+ char *str2 = StringValuePtr(_str2);
+ unsigned int len = FIX2UINT(_len);
+
+ unsigned int hamming_dist = 0;
+ unsigned int i = 0;
+
+ for (i = 0; i < len; i++)
+ {
+ if (! MATCH(str1[i], str2[i]))
+ {
+ hamming_dist++;
+ }
+ }
+
+ return UINT2NUM(hamming_dist);
+ }
+ }
+ end
+end
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
v1 = "\0" * (t.length + 1) * BYTES_IN_INT
l = self.new
- l.distance_C(s, t, s.length, t.length, v0, v1)
+ l.levenshtein_distance_C(s, t, s.length, t.length, v0, v1)
end
# >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
}
builder.c %{
- VALUE distance_C(
+ VALUE levenshtein_distance_C(
VALUE _s, // string
VALUE _t, // string
VALUE _s_len, // string length
--- /dev/null
+#!/usr/bin/env ruby
+$:.unshift File.join(File.dirname(__FILE__), '..', '..')
+
+# Copyright (C) 2013 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'test/unit'
+require 'test/helper'
+require 'maasha/seq/hamming'
+
+class HammingTest < Test::Unit::TestCase
+ test "Hamming.distance returns 0 with identical strings" do
+ assert_equal(0, Hamming.distance("atcg", "atcg"))
+ end
+
+ test "Hamming.distance with mismatch only returns correctly" do
+ assert_equal(1, Hamming.distance("atcg", "atgg"))
+ end
+
+ test "Hamming.distance with ambiguity code returns correctly" do
+ assert_equal(0, Hamming.distance("atcg", "nnnn"))
+ end
+end