]> git.donarmstrong.com Git - biopieces.git/commitdiff
added hamming distance to adaptor location/clipping in ruby lib
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 7 Mar 2011 13:41:14 +0000 (13:41 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 7 Mar 2011 13:41:14 +0000 (13:41 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1282 74ccb610-7750-0410-82ae-013aeee3265d

code_ruby/Maasha/lib/seq.rb
code_ruby/Maasha/test/test_seq.rb

index c4083237ff38018e3ace039eaac3fed5c32a5c76..1a7d698299c3259b87f9904fdbfd145506126e95 100644 (file)
@@ -308,16 +308,20 @@ class Seq
   # Method that locates an adaptor or part thereof in the sequence
   # of a Seq object beginning from the right. Returns the location
   # in the sequence that overlaps with the adaptor or -1 if the
-  # adaptor was not found.
-  def adaptor_locate_right(adaptor)
+  # adaptor was not found. The hd_percent is used to calculate the
+  # maximum hamming distance allowed for all possible overlaps.
+  def adaptor_locate_right(adaptor, hd_percent = 0)
+    raise SeqError, "Hamming distance percent out of range #{hd_percent}" unless (0 .. 100).include? hd_percent
     pos = self.length - adaptor.length
 
     while pos < self.length
-      len        = self.length - pos
-      subseq     = self.seq[pos ... pos + len].upcase
-      subadaptor = adaptor[0 ... len].upcase
+      len          = self.length - pos
+      subseq       = self.seq[pos ... pos + len].upcase
+      subadaptor   = adaptor[0 ... len].upcase
+      hamming_max  = (len * hd_percent * 0.01).round
+      hamming_dist = String.hamming_dist(subseq, subadaptor)
+      return pos if hamming_dist <= hamming_max
 
-      return pos if subseq == subadaptor
       pos += 1
     end
 
@@ -327,18 +331,22 @@ class Seq
   # Method that locates an adaptor or part thereof in the sequence
   # of a Seq object beginning from the left. Returns the location
   # in the sequence that overlaps with the adaptor or -1 if the
-  # adaptor was not found.
-  def adaptor_locate_left(adaptor)
+  # adaptor was not found. The hd_percent is used to calculate the
+  # maximum hamming distance allowed for all possible overlaps.
+  def adaptor_locate_left(adaptor, hd_percent = 0)
+    raise SeqError, "Hamming distance percent out of range #{hd_percent}" unless (0 .. 100).include? hd_percent
     pos = adaptor.length
 
     while pos > 0
-      len        = pos
-      subseq     = self.seq[0 ... len].upcase
-      subadaptor = adaptor[adaptor.length - len ... adaptor.length].upcase
+      len          = pos
+      subseq       = self.seq[0 ... len].upcase
+      subadaptor   = adaptor[adaptor.length - len ... adaptor.length].upcase
+      hamming_max  = (len * hd_percent * 0.01).round
+      hamming_dist = String.hamming_dist(subseq, subadaptor)
 
       pos -= 1
 
-      return pos if subseq == subadaptor
+      return pos if hamming_dist <= hamming_max
     end
     
     -1
@@ -346,9 +354,10 @@ class Seq
 
   # Method that locates an adaptor or part thereof in the sequence
   # of a Seq object beginning from the right and removes the adaptor
-  # sequence if found.
-  def adaptor_clip_right(adaptor)
-    pos = self.adaptor_locate_right(adaptor)
+  # sequence if found. The hd_percent is used to calculate the
+  # maximum hamming distance allowed for all possible overlaps.
+  def adaptor_clip_right(adaptor, hd_percent = 0)
+    pos = self.adaptor_locate_right(adaptor, hd_percent)
 
     if pos > 0
       self.seq  = self.seq[0 ... pos]
@@ -358,9 +367,10 @@ class Seq
 
   # Method that locates an adaptor or part thereof in the sequence
   # of a Seq object beginning from the left and removes the adaptor
-  # sequence if found.
-  def adaptor_clip_left(adaptor)
-    pos = self.adaptor_locate_left(adaptor)
+  # sequence if found. The hd_percent is used to calculate the
+  # maximum hamming distance allowed for all possible overlaps.
+  def adaptor_clip_left(adaptor, hd_percent = 0)
+    pos = self.adaptor_locate_left(adaptor, hd_percent)
 
     if pos > 0
       self.seq  = self.seq[pos + 1 ... self.length]
index 9296e14c9eb57e23457a34228c2011854fb7e8e0..25280c5eeca5a0304f4bc2e892337e22dde072d0 100755 (executable)
@@ -332,6 +332,19 @@ class TestSeq < Test::Unit::TestCase
     assert_equal(25.00, @entry.soft_mask)
   end
 
+  def test_Seq_adaptor_locate_right_with_bad_hamming_dist_raises
+    @entry.seq = "ATCG"
+    assert_raise(SeqError) { @entry.adaptor_locate_right("ATCG", -1) }
+    assert_raise(SeqError) { @entry.adaptor_locate_right("ATCG", 101) }
+  end
+
+  def test_Seq_adaptor_locate_right_with_ok_hamming_dist_dont_raise
+    @entry.seq = "ATCG"
+    assert_nothing_raised { @entry.adaptor_locate_right("ATCG", 0) }
+    assert_nothing_raised { @entry.adaptor_locate_right("ATCG", 50.5) }
+    assert_nothing_raised { @entry.adaptor_locate_right("ATCG", 100) }
+  end
+
   def test_Seq_adaptor_locate_right_returns_correctly
     @entry.seq = "nnnnncgat"
     assert_equal(-1, @entry.adaptor_locate_right("X"))
@@ -342,6 +355,25 @@ class TestSeq < Test::Unit::TestCase
     assert_equal(0,  @entry.adaptor_locate_right("NNNNNCGAT"))
   end
 
+  def test_Seq_adaptor_locate_right_with_hd_returns_correctly
+    @entry.seq = "nnnnncgat"
+    assert_equal(5, @entry.adaptor_locate_right("XGAT", 25))
+    assert_equal(5, @entry.adaptor_locate_right("XXAT", 50))
+  end
+
+  def test_Seq_adaptor_locate_left_with_bad_hamming_dist_raises
+    @entry.seq = "ATCG"
+    assert_raise(SeqError) { @entry.adaptor_locate_left("ATCG", -1) }
+    assert_raise(SeqError) { @entry.adaptor_locate_left("ATCG", 101) }
+  end
+
+  def test_Seq_adaptor_locate_left_with_ok_hamming_dist_dont_raise
+    @entry.seq = "ATCG"
+    assert_nothing_raised { @entry.adaptor_locate_left("ATCG", 0) }
+    assert_nothing_raised { @entry.adaptor_locate_left("ATCG", 50.5) }
+    assert_nothing_raised { @entry.adaptor_locate_left("ATCG", 100) }
+  end
+
   def test_Seq_adaptor_locate_left_returns_correctly
     @entry.seq = "cgatnnnnn"
     assert_equal(-1, @entry.adaptor_locate_left("X"))
@@ -352,12 +384,24 @@ class TestSeq < Test::Unit::TestCase
     assert_equal(8,  @entry.adaptor_locate_left("CGATNNNNN"))
   end
 
+  def test_Seq_adaptor_locate_left_with_hd_returns_correctly
+    @entry.seq = "cgatnnnnn"
+    assert_equal(3, @entry.adaptor_locate_left("XGAT", 25))
+    assert_equal(3, @entry.adaptor_locate_left("XXAT", 50))
+  end
+
   def test_Seq_adaptor_clip_right_returns_correct_sequence
     @entry.seq = "nnnnncgat"
     @entry.adaptor_clip_right("cgat")
     assert_equal( "nnnnn", @entry.seq)
   end
 
+  def test_Seq_adaptor_clip_right_with_hd_returns_correct_sequence
+    @entry.seq = "nnnnncgat"
+    @entry.adaptor_clip_right("xgat", 25)
+    assert_equal( "nnnnn", @entry.seq)
+  end
+
   def test_Seq_adaptor_clip_right_returns_correct_qual
     @entry.seq  = "nnnnncgat"
     @entry.qual = "abcdefghi"
@@ -365,12 +409,25 @@ class TestSeq < Test::Unit::TestCase
     assert_equal( "abcde", @entry.qual)
   end
 
+  def test_Seq_adaptor_clip_right_with_hd_returns_correct_qual
+    @entry.seq  = "nnnnncgat"
+    @entry.qual = "abcdefghi"
+    @entry.adaptor_clip_right("xgat", 25)
+    assert_equal( "abcde", @entry.qual)
+  end
+
   def test_Seq_adaptor_clip_left_returns_correct_sequence
     @entry.seq = "cgatnnnnn"
     @entry.adaptor_clip_left("cgat")
     assert_equal( "nnnnn", @entry.seq)
   end
 
+  def test_Seq_adaptor_clip_left_with_hd_returns_correct_sequence
+    @entry.seq = "cgatnnnnn"
+    @entry.adaptor_clip_left("cgax", 25)
+    assert_equal( "nnnnn", @entry.seq)
+  end
+
   def test_Seq_adaptor_clip_left_returns_correct_qual
     @entry.seq  = "cgatnnnnn"
     @entry.qual = "abcdefghi"
@@ -378,6 +435,13 @@ class TestSeq < Test::Unit::TestCase
     assert_equal( "efghi", @entry.qual)
   end
 
+  def test_Seq_adaptor_clip_left_returns_correct_qual
+    @entry.seq  = "cgatnnnnn"
+    @entry.qual = "abcdefghi"
+    @entry.adaptor_clip_left("cgax", 25)
+    assert_equal( "efghi", @entry.qual)
+  end
+
   def test_Digest_new_raises_on_bad_pattern_residue
     assert_raise(DigestError) { Digest.new(@entry, "X", 4) }
   end