SCORE_PHRED = 33
SCORE_ILLUMINA = 64
-# Nucleotide equivalents
+# IUPAC nucleotide pair ambiguity equivalents
EQUAL = {
:AA => true, :BU => true, :TH => true, :UY => true,
:TT => true, :CB => true, :UH => true, :SC => true,
# or nil if no match was found. Hamming or Edit distance may be specified.
def match(pattern, pos = 0, hd = 0, ed = 0)
while pos < self.length - pattern.length + 1
- str1 = self.seq[pos ... pos + pattern.length]
- str2 = pattern
-
- rows = str1.length + 1
- cols = str2.length + 1
-
- matches = 0
- mismatches = 0
- insertions = 0
- deletions = 0
-
- matrix = NArray.int(rows, cols)
-
- for i in 0 ... rows do matrix[i, 0] = i end
- for j in 0 ... cols do matrix[0, j] = j end
-
- for j in 1 ... cols do
- for i in 1 ... rows do
- if EQUAL[(str1[i - 1].upcase + str2[j - 1].upcase).to_sym]
- matrix[i, j] = matrix[i - 1, j - 1]
- matches += 1
- else
- del = matrix[i - 1, j] + 1
- ins = matrix[i, j - 1] + 1
- mis = matrix[i - 1, j - 1] + 1
-
- if del < ins and del < mis
- deletions += 1
- matrix[i, j] = del
- elsif ins < del and ins < mis
- insertions += 1
- matrix[i, j] = ins
+ if ed == 0
+ match = 0
+ mis = 0
+
+ for i in 0 ... pattern.length do
+ EQUAL[(self.seq[pos + i].upcase + pattern[i].upcase).to_sym] ? match += 1 : mis += 1
+
+ break if mis > hd
+ end
+
+ if pattern.length - hd == match
+ return pos
+ else
+ pos += (match - hd >= 0) ? 1 + match - hd : 1
+ end
+ else
+ str1 = self.seq[pos ... pos + pattern.length]
+ str2 = pattern
+
+ rows = str1.length + 1
+ cols = str2.length + 1
+
+ matrix = NArray.int(rows, cols)
+
+ for i in 0 ... rows do matrix[i, 0] = i end
+ for j in 0 ... cols do matrix[0, j] = j end
+
+ for j in 1 ... cols do
+ for i in 1 ... rows do
+ if EQUAL[(str1[i - 1].upcase + str2[j - 1].upcase).to_sym]
+ matrix[i, j] = matrix[i - 1, j - 1]
else
- mismatches += 1
- matrix[i, j] = mis
+ del = matrix[i - 1, j] + 1
+ ins = matrix[i, j - 1] + 1
+ mis = matrix[i - 1, j - 1] + 1
+
+ matrix[i, j] = [del, ins, mis].min
end
end
end
- end
- return pos if matrix[rows - 1, cols - 1] <= hd
+ return pos if matrix[rows - 1, cols - 1] <= ed
- pos += 1
+ pos += 1
+ end
end
end
end
def test_Seq_match_returns_correctly
- @entry.seq = "atcg"
+ @entry.seq = "atcgatct"
assert_equal(0, @entry.match("aTc"))
- assert_equal(1, @entry.match("Ncg"))
+ assert_equal(4, @entry.match("aNct"))
end
def test_Seq_match_with_pos_returns_correctly
assert_equal(0, @entry.match("XXCX", pos=0, hd=3))
assert_equal(0, @entry.match("XXXX", pos=0, hd=4))
end
+
+ def test_Seq_match_with_pos_and_hamming_dist_returns_correctly
+ @entry.seq = "atcgttcg"
+ assert_equal(4, @entry.match("ATCG", pos=1, hd=1))
+ end
end