From: martinahansen Date: Mon, 7 Mar 2011 13:41:14 +0000 (+0000) Subject: added hamming distance to adaptor location/clipping in ruby lib X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=5d7cb646d7728d71cc0227b553cbcb6e179bc89a;p=biopieces.git added hamming distance to adaptor location/clipping in ruby lib git-svn-id: http://biopieces.googlecode.com/svn/trunk@1282 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index c408323..1a7d698 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -308,16 +308,20 @@ class Seq # Method that locates an adaptor or part thereof in the sequence # of a Seq object beginning from the right. Returns the location # in the sequence that overlaps with the adaptor or -1 if the - # adaptor was not found. - def adaptor_locate_right(adaptor) + # adaptor was not found. The hd_percent is used to calculate the + # maximum hamming distance allowed for all possible overlaps. + def adaptor_locate_right(adaptor, hd_percent = 0) + raise SeqError, "Hamming distance percent out of range #{hd_percent}" unless (0 .. 100).include? hd_percent pos = self.length - adaptor.length while pos < self.length - len = self.length - pos - subseq = self.seq[pos ... pos + len].upcase - subadaptor = adaptor[0 ... len].upcase + len = self.length - pos + subseq = self.seq[pos ... pos + len].upcase + subadaptor = adaptor[0 ... len].upcase + hamming_max = (len * hd_percent * 0.01).round + hamming_dist = String.hamming_dist(subseq, subadaptor) + return pos if hamming_dist <= hamming_max - return pos if subseq == subadaptor pos += 1 end @@ -327,18 +331,22 @@ class Seq # Method that locates an adaptor or part thereof in the sequence # of a Seq object beginning from the left. Returns the location # in the sequence that overlaps with the adaptor or -1 if the - # adaptor was not found. - def adaptor_locate_left(adaptor) + # adaptor was not found. The hd_percent is used to calculate the + # maximum hamming distance allowed for all possible overlaps. + def adaptor_locate_left(adaptor, hd_percent = 0) + raise SeqError, "Hamming distance percent out of range #{hd_percent}" unless (0 .. 100).include? hd_percent pos = adaptor.length while pos > 0 - len = pos - subseq = self.seq[0 ... len].upcase - subadaptor = adaptor[adaptor.length - len ... adaptor.length].upcase + len = pos + subseq = self.seq[0 ... len].upcase + subadaptor = adaptor[adaptor.length - len ... adaptor.length].upcase + hamming_max = (len * hd_percent * 0.01).round + hamming_dist = String.hamming_dist(subseq, subadaptor) pos -= 1 - return pos if subseq == subadaptor + return pos if hamming_dist <= hamming_max end -1 @@ -346,9 +354,10 @@ class Seq # Method that locates an adaptor or part thereof in the sequence # of a Seq object beginning from the right and removes the adaptor - # sequence if found. - def adaptor_clip_right(adaptor) - pos = self.adaptor_locate_right(adaptor) + # sequence if found. The hd_percent is used to calculate the + # maximum hamming distance allowed for all possible overlaps. + def adaptor_clip_right(adaptor, hd_percent = 0) + pos = self.adaptor_locate_right(adaptor, hd_percent) if pos > 0 self.seq = self.seq[0 ... pos] @@ -358,9 +367,10 @@ class Seq # Method that locates an adaptor or part thereof in the sequence # of a Seq object beginning from the left and removes the adaptor - # sequence if found. - def adaptor_clip_left(adaptor) - pos = self.adaptor_locate_left(adaptor) + # sequence if found. The hd_percent is used to calculate the + # maximum hamming distance allowed for all possible overlaps. + def adaptor_clip_left(adaptor, hd_percent = 0) + pos = self.adaptor_locate_left(adaptor, hd_percent) if pos > 0 self.seq = self.seq[pos + 1 ... self.length] diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index 9296e14..25280c5 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -332,6 +332,19 @@ class TestSeq < Test::Unit::TestCase assert_equal(25.00, @entry.soft_mask) end + def test_Seq_adaptor_locate_right_with_bad_hamming_dist_raises + @entry.seq = "ATCG" + assert_raise(SeqError) { @entry.adaptor_locate_right("ATCG", -1) } + assert_raise(SeqError) { @entry.adaptor_locate_right("ATCG", 101) } + end + + def test_Seq_adaptor_locate_right_with_ok_hamming_dist_dont_raise + @entry.seq = "ATCG" + assert_nothing_raised { @entry.adaptor_locate_right("ATCG", 0) } + assert_nothing_raised { @entry.adaptor_locate_right("ATCG", 50.5) } + assert_nothing_raised { @entry.adaptor_locate_right("ATCG", 100) } + end + def test_Seq_adaptor_locate_right_returns_correctly @entry.seq = "nnnnncgat" assert_equal(-1, @entry.adaptor_locate_right("X")) @@ -342,6 +355,25 @@ class TestSeq < Test::Unit::TestCase assert_equal(0, @entry.adaptor_locate_right("NNNNNCGAT")) end + def test_Seq_adaptor_locate_right_with_hd_returns_correctly + @entry.seq = "nnnnncgat" + assert_equal(5, @entry.adaptor_locate_right("XGAT", 25)) + assert_equal(5, @entry.adaptor_locate_right("XXAT", 50)) + end + + def test_Seq_adaptor_locate_left_with_bad_hamming_dist_raises + @entry.seq = "ATCG" + assert_raise(SeqError) { @entry.adaptor_locate_left("ATCG", -1) } + assert_raise(SeqError) { @entry.adaptor_locate_left("ATCG", 101) } + end + + def test_Seq_adaptor_locate_left_with_ok_hamming_dist_dont_raise + @entry.seq = "ATCG" + assert_nothing_raised { @entry.adaptor_locate_left("ATCG", 0) } + assert_nothing_raised { @entry.adaptor_locate_left("ATCG", 50.5) } + assert_nothing_raised { @entry.adaptor_locate_left("ATCG", 100) } + end + def test_Seq_adaptor_locate_left_returns_correctly @entry.seq = "cgatnnnnn" assert_equal(-1, @entry.adaptor_locate_left("X")) @@ -352,12 +384,24 @@ class TestSeq < Test::Unit::TestCase assert_equal(8, @entry.adaptor_locate_left("CGATNNNNN")) end + def test_Seq_adaptor_locate_left_with_hd_returns_correctly + @entry.seq = "cgatnnnnn" + assert_equal(3, @entry.adaptor_locate_left("XGAT", 25)) + assert_equal(3, @entry.adaptor_locate_left("XXAT", 50)) + end + def test_Seq_adaptor_clip_right_returns_correct_sequence @entry.seq = "nnnnncgat" @entry.adaptor_clip_right("cgat") assert_equal( "nnnnn", @entry.seq) end + def test_Seq_adaptor_clip_right_with_hd_returns_correct_sequence + @entry.seq = "nnnnncgat" + @entry.adaptor_clip_right("xgat", 25) + assert_equal( "nnnnn", @entry.seq) + end + def test_Seq_adaptor_clip_right_returns_correct_qual @entry.seq = "nnnnncgat" @entry.qual = "abcdefghi" @@ -365,12 +409,25 @@ class TestSeq < Test::Unit::TestCase assert_equal( "abcde", @entry.qual) end + def test_Seq_adaptor_clip_right_with_hd_returns_correct_qual + @entry.seq = "nnnnncgat" + @entry.qual = "abcdefghi" + @entry.adaptor_clip_right("xgat", 25) + assert_equal( "abcde", @entry.qual) + end + def test_Seq_adaptor_clip_left_returns_correct_sequence @entry.seq = "cgatnnnnn" @entry.adaptor_clip_left("cgat") assert_equal( "nnnnn", @entry.seq) end + def test_Seq_adaptor_clip_left_with_hd_returns_correct_sequence + @entry.seq = "cgatnnnnn" + @entry.adaptor_clip_left("cgax", 25) + assert_equal( "nnnnn", @entry.seq) + end + def test_Seq_adaptor_clip_left_returns_correct_qual @entry.seq = "cgatnnnnn" @entry.qual = "abcdefghi" @@ -378,6 +435,13 @@ class TestSeq < Test::Unit::TestCase assert_equal( "efghi", @entry.qual) end + def test_Seq_adaptor_clip_left_returns_correct_qual + @entry.seq = "cgatnnnnn" + @entry.qual = "abcdefghi" + @entry.adaptor_clip_left("cgax", 25) + assert_equal( "efghi", @entry.qual) + end + def test_Digest_new_raises_on_bad_pattern_residue assert_raise(DigestError) { Digest.new(@entry, "X", 4) } end