From cdbd4497b13b247333ac445663994785978481df Mon Sep 17 00:00:00 2001 From: martinahansen Date: Sun, 20 Mar 2011 21:40:28 +0000 Subject: [PATCH] plain and hd match in ruby Seq.rb now works git-svn-id: http://biopieces.googlecode.com/svn/trunk@1304 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/Maasha/lib/seq.rb | 78 ++++++++++++++++--------------- code_ruby/Maasha/test/test_seq.rb | 9 +++- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index 5dedb36..a922fc7 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -36,7 +36,7 @@ INDELS = %w[. - _ ~] SCORE_PHRED = 33 SCORE_ILLUMINA = 64 -# Nucleotide equivalents +# IUPAC nucleotide pair ambiguity equivalents EQUAL = { :AA => true, :BU => true, :TH => true, :UY => true, :TT => true, :CB => true, :UH => true, :SC => true, @@ -435,49 +435,51 @@ class Seq # or nil if no match was found. Hamming or Edit distance may be specified. def match(pattern, pos = 0, hd = 0, ed = 0) while pos < self.length - pattern.length + 1 - str1 = self.seq[pos ... pos + pattern.length] - str2 = pattern - - rows = str1.length + 1 - cols = str2.length + 1 - - matches = 0 - mismatches = 0 - insertions = 0 - deletions = 0 - - matrix = NArray.int(rows, cols) - - for i in 0 ... rows do matrix[i, 0] = i end - for j in 0 ... cols do matrix[0, j] = j end - - for j in 1 ... cols do - for i in 1 ... rows do - if EQUAL[(str1[i - 1].upcase + str2[j - 1].upcase).to_sym] - matrix[i, j] = matrix[i - 1, j - 1] - matches += 1 - else - del = matrix[i - 1, j] + 1 - ins = matrix[i, j - 1] + 1 - mis = matrix[i - 1, j - 1] + 1 - - if del < ins and del < mis - deletions += 1 - matrix[i, j] = del - elsif ins < del and ins < mis - insertions += 1 - matrix[i, j] = ins + if ed == 0 + match = 0 + mis = 0 + + for i in 0 ... pattern.length do + EQUAL[(self.seq[pos + i].upcase + pattern[i].upcase).to_sym] ? match += 1 : mis += 1 + + break if mis > hd + end + + if pattern.length - hd == match + return pos + else + pos += (match - hd >= 0) ? 1 + match - hd : 1 + end + else + str1 = self.seq[pos ... pos + pattern.length] + str2 = pattern + + rows = str1.length + 1 + cols = str2.length + 1 + + matrix = NArray.int(rows, cols) + + for i in 0 ... rows do matrix[i, 0] = i end + for j in 0 ... cols do matrix[0, j] = j end + + for j in 1 ... cols do + for i in 1 ... rows do + if EQUAL[(str1[i - 1].upcase + str2[j - 1].upcase).to_sym] + matrix[i, j] = matrix[i - 1, j - 1] else - mismatches += 1 - matrix[i, j] = mis + del = matrix[i - 1, j] + 1 + ins = matrix[i, j - 1] + 1 + mis = matrix[i - 1, j - 1] + 1 + + matrix[i, j] = [del, ins, mis].min end end end - end - return pos if matrix[rows - 1, cols - 1] <= hd + return pos if matrix[rows - 1, cols - 1] <= ed - pos += 1 + pos += 1 + end end end diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index f8ae29e..0e986f6 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -448,9 +448,9 @@ class TestSeq < Test::Unit::TestCase end def test_Seq_match_returns_correctly - @entry.seq = "atcg" + @entry.seq = "atcgatct" assert_equal(0, @entry.match("aTc")) - assert_equal(1, @entry.match("Ncg")) + assert_equal(4, @entry.match("aNct")) end def test_Seq_match_with_pos_returns_correctly @@ -465,6 +465,11 @@ class TestSeq < Test::Unit::TestCase assert_equal(0, @entry.match("XXCX", pos=0, hd=3)) assert_equal(0, @entry.match("XXXX", pos=0, hd=4)) end + + def test_Seq_match_with_pos_and_hamming_dist_returns_correctly + @entry.seq = "atcgttcg" + assert_equal(4, @entry.match("ATCG", pos=1, hd=1)) + end end -- 2.39.5