From: martinahansen Date: Thu, 14 Oct 2010 14:04:50 +0000 (+0000) Subject: ported analyze_seq to ruby X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=52e3a37dad6b05296ab78a465f2af804ef6ea377;p=biopieces.git ported analyze_seq to ruby git-svn-id: http://biopieces.googlecode.com/svn/trunk@1132 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index 52f7e44..039c4ce 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -2,6 +2,7 @@ DNA = %w[a t c g] RNA = %w[a u c g] PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g] +INDELS = %w[.- _ ~] # Quality scores bases SCORE_PHRED = 33 @@ -32,6 +33,12 @@ class Seq alias len length + # Return the number indels in a sequence. + def indels + regex = Regexp.new(/[#{Regexp.escape(INDELS.join(""))}]/) + self.seq.scan(regex).size + end + # Method that returns true is a given sequence type is DNA. def is_dna? self.type == 'dna' @@ -141,6 +148,31 @@ class Seq seq_new end + # Method that returns the residue compositions of a sequence in + # a hash where the key is the residue and the value is the residue + # count. + def composition + comp = Hash.new(0); + + self.seq.upcase.each_char do |char| + comp[char] += 1 + end + + comp + end + + # Method that returns the percentage of hard masked residues + # or N's in a sequence. + def hard_mask + ((self.seq.upcase.scan("N").size.to_f / (self.len - self.indels).to_f) * 100).round(2) + end + + # Method that returns the percentage of soft masked residues + # or lower cased residues in a sequence. + def soft_mask + ((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2) + end + # Method to convert the quality scores from a specified base # to another base. def convert_phred2illumina! @@ -297,14 +329,6 @@ class Seq < String self.replace(self.wrap(width, delimit)) end - # Method that generates a random sequence of a given length. - def generate(length) - raise ArgumentError, "Cannot generate negative sequence length: #{length}." if length <= 0 - - alph = self.residues - Array.new(length) { alph[rand(alph.size)] }.join("") - end - # Method that replaces sequence with a randomly generated sequence of a given length. def generate!(length) self.replace(self.generate(length)) diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index a8e8166..de60d4c 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -42,11 +42,16 @@ class TestSeq < Test::Unit::TestCase assert(@entry.is_protein? == true) end - def test_Sequence_length_is_correct + def test_Seq_length_is_correct @entry.seq = 'ATCG' assert_equal(4, @entry.length) end + def test_Seq_indels_is_correct + @entry.seq = 'ATCG.-~_' + assert_equal(4, @entry.indels) + end + def test_Seq_to_rna_raises_if_no_sequence @entry.type = 'dna' assert_raise(SeqError) { @entry.to_rna } @@ -177,6 +182,25 @@ class TestSeq < Test::Unit::TestCase end end + def test_Seq_composition_returns_correctly + @entry.seq = "AAAATTTCCG" + assert_equal(4, @entry.composition["A"]) + assert_equal(3, @entry.composition["T"]) + assert_equal(2, @entry.composition["C"]) + assert_equal(1, @entry.composition["G"]) + assert_equal(0, @entry.composition["X"]) + end + + def test_Seq_hard_mask_returns_correctly + @entry.seq = "--AAAANn" + assert_equal(33.33, @entry.hard_mask) + end + + def test_Seq_soft_mask_returns_correctly + @entry.seq = "--AAAa" + assert_equal(25.00, @entry.soft_mask) + end + def test_Digest_new_raises_on_bad_pattern_residue assert_raise(DigestError) { Digest.new(@entry, "X", 4) } end