From 207879dff45ee3307ab029af43e5031b833e0326 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 17 Sep 2010 09:59:00 +0000 Subject: [PATCH] added Digest class to ruby code git-svn-id: http://biopieces.googlecode.com/svn/trunk@1093 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/Maasha/lib/seq.rb | 99 +++++++++++++++++++++++++++++++ code_ruby/Maasha/test/test_seq.rb | 80 +++++++++++++++++++++---- 2 files changed, 167 insertions(+), 12 deletions(-) diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index e00590c..68d6bad 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -1,3 +1,4 @@ +# Residue alphabets DNA = %w[a t c g] RNA = %w[a u c g] PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g] @@ -8,6 +9,11 @@ class SeqError < StandardError; end class Seq attr_accessor :seq_name, :seq, :type, :qual + # Initialize a sequence object with the following arguments: + # - seq_name: Name of the sequence. + # - seq: The sequence. + # - type: The sequence type - DNA, RNA, or protein + # - qual: A Solexa type quality scores string. def initialize(seq_name = nil, seq = nil, type = nil, qual = nil) @seq_name = seq_name @seq = seq @@ -15,6 +21,7 @@ class Seq @qual = qual end + # Returns the length of a sequence. def length self.seq.nil? ? 0 : self.seq.length end @@ -26,10 +33,12 @@ class Seq self.type == 'dna' end + # Method that returns true is a given sequence type is RNA. def is_rna? self.type == 'rna' end + # Method that returns true is a given sequence type is protein. def is_protein? self.type == 'protein' end @@ -128,6 +137,96 @@ class Seq end end +# Error class for all exceptions to do with Digest. +class DigestError < StandardError; end + +class Digest + # Initialize a digest object with the following arguments: + # - seq: A sequence object. + # - pattern: A restriction enzyme recognition pattern. + # - cut_pos: Offset from match position where the enzyme cuts. + def initialize(seq, pattern, cut_pos) + @seq = seq + @pattern = disambiguate(pattern) + @cut_pos = cut_pos + end + + # Method that returns a list of positions where + # a specified restriction enzyme will cut the sequence. + def positions + positions = [] + + @seq.seq.scan @pattern do |match| + pos = $`.length + @cut_pos - 1 + + if pos >= 0 and pos < @seq.seq.length - 2 + positions << pos + end + end + + positions + end + + # Method that returns a list of strings with digestion produts + # from the digestion of a specified sequence with a specified + # restriction enzyme. + def products + products = [] + + beg = 0 + + self.positions.each do |pos| + products << @seq.seq[beg .. pos] + beg = pos + 1 + end + + if beg < @seq.seq.length + products << @seq.seq[beg .. @seq.seq.length] + end + + products + end + + private + + # Method that returns a regexp object with a restriction + # enzyme pattern with ambiguity codes substituted to the + # appropriate regexp. + def disambiguate(pattern) + ambiguity = { + 'A' => "A", + 'T' => "T", + 'U' => "T", + 'C' => "C", + 'G' => "G", + 'M' => "[AC]", + 'R' => "[AG]", + 'W' => "[AT]", + 'S' => "[CG]", + 'Y' => "[CT]", + 'K' => "[GT]", + 'V' => "[ACG]", + 'H' => "[ACT]", + 'D' => "[AGT]", + 'B' => "[CGT]", + 'N' => "[GATC]" + } + + new_pattern = "" + + pattern.upcase.each_char do |char| + if ambiguity.has_key? char + new_pattern << ambiguity[char] + else + raise DigestError, "Could not disambiguate residue: #{char}" + end + end + + Regexp.new(new_pattern) + end +end + + __END__ diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index 52f14af..2e003e2 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -44,7 +44,7 @@ class TestSeq < Test::Unit::TestCase def test_Sequence_length_is_correct @entry.seq = 'ATCG' - assert_equal(@entry.length, 4) + assert_equal(4, @entry.length) end def test_Seq_to_rna_raises_if_no_sequence @@ -61,14 +61,14 @@ class TestSeq < Test::Unit::TestCase def test_Seq_to_rna_transcribes_correctly @entry.seq = 'ATCGatcg' @entry.type = 'dna' - assert_equal(@entry.to_rna, "AUCGaucg") + assert_equal("AUCGaucg", @entry.to_rna) end def test_Seq_to_rna_changes_entry_type_to_rna @entry.seq = 'ATCGatcg' @entry.type = 'dna' @entry.to_rna - assert_equal(@entry.type, "rna") + assert_equal("rna", @entry.type) end def test_Seq_to_dna_raises_if_no_sequence @@ -85,20 +85,20 @@ class TestSeq < Test::Unit::TestCase def test_Seq_to_dna_transcribes_correctly @entry.seq = 'AUCGaucg' @entry.type = 'rna' - assert_equal(@entry.to_dna, "ATCGatcg") + assert_equal("ATCGatcg", @entry.to_dna) end def test_Seq_to_dna_changes_entry_type_to_dna @entry.seq = 'AUCGaucg' @entry.type = 'rna' @entry.to_dna - assert_equal(@entry.type, "dna") + assert_equal("dna", @entry.type) end def test_Seq_to_bp_returns_correct_record @entry.seq_name = 'test' - @entry.seq = 'ATCG' - assert_equal(@entry.to_bp, {"SEQ_NAME"=>"test", "SEQ"=>"ATCG", "SEQ_LEN"=>4}) + @entry.seq = 'ATCG' + assert_equal({:SEQ_NAME=>"test", :SEQ=>"ATCG", :SEQ_LEN=>4}, @entry.to_bp) end def test_Seq_to_bp_raises_on_missing_seq_name @@ -112,15 +112,21 @@ class TestSeq < Test::Unit::TestCase end def test_Seq_to_fasta_returns_correct_entry - entry = Seq.new( "test", "ATCG" ) - assert_equal(">test\nATCG\n", entry.to_fasta) + @entry.seq_name = 'test' + @entry.seq = 'ATCG' + assert_equal(">test\nATCG\n", @entry.to_fasta) end def test_Seq_to_fasta_wraps_correctly - entry = Seq.new( "test", "ATCG" ) + entry = Seq.new("test", "ATCG") assert_equal(">test\nAT\nCG\n", entry.to_fasta(2)) end + def test_Seq_reverse_returns_correctly + @entry.seq = "ATCG" + assert_equal("GCTA", @entry.reverse) + end + def test_Seq_complement_raises_if_no_sequence @entry.type = 'dna' assert_raise(SeqError) { @entry.complement } @@ -135,13 +141,25 @@ class TestSeq < Test::Unit::TestCase def test_Seq_complement_for_DNA_is_correct @entry.seq = 'ATCGatcg' @entry.type = 'dna' - assert_equal(@entry.complement, "TAGCtagc") + assert_equal("TAGCtagc", @entry.complement) end def test_Seq_complement_for_RNA_is_correct @entry.seq = 'AUCGaucg' @entry.type = 'rna' - assert_equal(@entry.complement, "UAGCuagc") + assert_equal("UAGCuagc", @entry.complement) + end + + def test_Seq_reverse_complement_for_DNA_is_correct + @entry.seq = 'ATCGatcg' + @entry.type = 'dna' + assert_equal("cgatCGAT", @entry.reverse_complement) + end + + def test_Seq_reverse_complement_for_RNA_is_correct + @entry.seq = 'AUCGaucg' + @entry.type = 'rna' + assert_equal("cgauCGAU", @entry.reverse_complement) end def test_Seq_generate_raises_if_length_lt_0 @@ -158,6 +176,44 @@ class TestSeq < Test::Unit::TestCase assert_nothing_raised { @entry.generate(10, type) } end end + + def test_Digest_new_raises_on_bad_pattern_residue + assert_raise(DigestError) { Digest.new(@entry, "X", 4) } + end + + def test_Digest_new_dont_raise_on_ok_pattern_residue + assert_nothing_raised { Digest.new(@entry, "AGCUTRYWSMKHDVBNagcutrywsmkhdvbn", 4) } + end + + def test_Digest_positions_return_correctly_at_begin + @entry.seq = "TTTTaaa" + digest = Digest.new(@entry, "TTTT", 0) + assert_equal([], digest.positions) + end + + def test_Digest_positions_return_correctly + @entry.seq = "TTTTaaa" + digest = Digest.new(@entry, "TTTT", 1) + assert_equal([0], digest.positions) + end + + def test_Digest_positions_return_correctly_at_end + @entry.seq = "aaaTTTT" + digest = Digest.new(@entry, "TTTT", 3) + assert_equal([], digest.positions) + end + + def test_Digest_positions_return_correctly_with_ambibuity_pattern + @entry.seq = "aaaTTTT" + digest = Digest.new(@entry, "TTNT", 1) + assert_equal([3], digest.positions) + end + + def test_Digest_products_return_correctly + @entry.seq = "aaaaTTTTbbbbTTTT" + digest = Digest.new(@entry, "TTNT", 1) + assert_equal(["aaaaT", "TTTbbbbT", "TTT"], digest.products) + end end -- 2.39.5