]> git.donarmstrong.com Git - biopieces.git/commitdiff
rewrite of seq.rb and addition of seq/trim.rb
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 23 May 2012 14:14:54 +0000 (14:14 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 23 May 2012 14:14:54 +0000 (14:14 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1821 74ccb610-7750-0410-82ae-013aeee3265d

code_ruby/Rakefile
code_ruby/lib/maasha/seq.rb
code_ruby/lib/maasha/seq/trim.rb [new file with mode: 0644]
code_ruby/test/maasha/seq/test_trim.rb [new file with mode: 0755]
code_ruby/test/maasha/test_seq.rb

index 17437ab807f82003f21bb1d7bb2c3d52aae75295..d685516c2cefe2db133413ed753528b90579b2f4 100644 (file)
@@ -1,6 +1,8 @@
 require 'rake/testtask'
 
-Rake::TestTask.new("test") do |t|
-  t.pattern = "test/maasha/test_*.rb"
+task :default => "test"
+
+Rake::TestTask.new do |t|
+  t.test_files = FileList["test/**/test_*.rb"]
   t.warning = true
 end
index 2382218d1f91f4de415e25c58869b239a368deda..0dcccba996108a0d75fc21de24e21b6abd2e9df1 100644 (file)
@@ -26,6 +26,7 @@ require 'maasha/patternmatcher'
 require 'maasha/bits'
 require 'maasha/backtrack'
 require 'maasha/digest'
+require 'maasha/seq/trim'
 require 'narray'
 #require 'maasha/patscan'
 
@@ -48,6 +49,7 @@ class Seq
   include PatternMatcher
   include BackTrack
   include Digest
+  include Trim
 
   attr_accessor :seq_name, :seq, :type, :qual
 
@@ -319,13 +321,20 @@ class Seq
   # and of a given length.
   def subseq(start, length = self.length - start)
     raise SeqError, "subsequence start: #{start} < 0"                                                if start  < 0
-    raise SeqError, "subsequence length: #{length} < 1"                                              if length <= 0
+    raise SeqError, "subsequence length: #{length} < 0"                                              if length < 0
     raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length
 
-    stop = start + length - 1
 
-    seq  = self.seq[start .. stop]
-    qual = self.qual[start .. stop] unless self.qual.nil?
+    if length == 0
+      seq  = ""
+      qual = "" unless self.qual.nil?
+    else
+      stop = start + length - 1
+
+      seq  = self.seq[start .. stop]
+      qual = self.qual[start .. stop] unless self.qual.nil?
+    end
+
 
     Seq.new(self.seq_name, seq, self.type, qual)
   end
@@ -334,13 +343,20 @@ class Seq
   # and of a given length.
   def subseq!(start, length = self.length - start)
     raise SeqError, "subsequence start: #{start} < 0"                                                if start  < 0
-    raise SeqError, "subsequence length: #{length} < 1"                                              if length <= 0
+    raise SeqError, "subsequence length: #{length} < 0"                                              if length < 0
     raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length
 
-    stop = start + length - 1
+    if length == 0
+      self.seq  = ""
+      self.qual = "" unless self.qual.nil?
+    else
+      stop = start + length - 1
+
+      self.seq  = self.seq[start .. stop]
+      self.qual = self.qual[start .. stop] unless self.qual.nil?
+    end
 
-    self.seq  = self.seq[start .. stop]
-    self.qual = self.qual[start .. stop] unless self.qual.nil?
+    self
   end
 
   # Method that returns a subsequence of a given length
@@ -355,40 +371,6 @@ class Seq
     self.subseq(start, length)
   end
 
-  def quality_trim_right(min)
-    raise SeqError, "no sequence"      if self.seq.nil?
-    raise SeqError, "no quality score" if self.qual.nil?
-    raise SeqError, "minimum value: #{min} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? min
-
-    regex_right = Regexp.new("[#{(SCORE_BASE).chr}-#{(SCORE_BASE + min).chr}]+$")
-
-    self.qual.match(regex_right) do |m|
-      self.subseq!(0, $`.length) if $`.length > 0
-    end
-
-    self
-  end
-
-  def quality_trim_left(min)
-    raise SeqError, "no sequence"      if self.seq.nil?
-    raise SeqError, "no quality score" if self.qual.nil?
-    raise SeqError, "minimum value: #{min} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? min
-
-    regex_left  = Regexp.new("^[#{(SCORE_BASE).chr}-#{(SCORE_BASE + min).chr}]+")
-
-    self.qual.match(regex_left) do |m|
-      self.subseq!(m.to_s.length, self.length - m.to_s.length) if self.length - m.to_s.length > 0
-    end
-
-    self
-  end
-
-  def quality_trim(min)
-    self.quality_trim_right(min)
-    self.quality_trim_left(min)
-    self
-  end
-
   # Method that returns the residue compositions of a sequence in
   # a hash where the key is the residue and the value is the residue
   # count.
diff --git a/code_ruby/lib/maasha/seq/trim.rb b/code_ruby/lib/maasha/seq/trim.rb
new file mode 100644 (file)
index 0000000..09dff89
--- /dev/null
@@ -0,0 +1,187 @@
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'inline'
+
+# Module containing methods for end trimming sequences with suboptimal quality
+# scores.
+module Trim
+  # Method to progressively trim a Seq object sequence from the right end until
+  # a run of min_len residues with quality scores above min_qual is encountered.
+  def quality_trim_right(min_qual, min_len = 1)
+    check_trim_args(min_qual)
+
+    pos = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+    self.subseq(0, pos)
+  end
+
+  # Method to progressively trim a Seq object sequence from the right end until
+  # a run of min_len residues with quality scores above min_qual is encountered.
+  def quality_trim_right!(min_qual, min_len = 1)
+    check_trim_args(min_qual)
+
+    pos = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+    self.subseq!(0, pos)
+  end
+
+  # Method to progressively trim a Seq object sequence from the left end until
+  # a run of min_len residues with quality scores above min_qual is encountered.
+  def quality_trim_left(min_qual, min_len = 1)
+    check_trim_args(min_qual)
+
+    pos = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+    self.subseq(pos)
+  end
+
+  # Method to progressively trim a Seq object sequence from the left end until
+  # a run of min_len residues with quality scores above min_qual is encountered.
+  def quality_trim_left!(min_qual, min_len = 1)
+    check_trim_args(min_qual)
+
+    pos = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+    self.subseq!(pos)
+  end
+
+  # Method to progressively trim a Seq object sequence from both ends until a
+  # run of min_len residues with quality scores above min_qual is encountered.
+  def quality_trim(min_qual, min_len = 1)
+    check_trim_args(min_qual)
+
+    pos_right = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+    pos_left  = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+    pos_left = pos_right if pos_left > pos_right
+
+    self.subseq(pos_left, pos_right - pos_left)
+  end
+
+  # Method to progressively trim a Seq object sequence from both ends until a
+  # run of min_len residues with quality scores above min_qual is encountered.
+  def quality_trim!(min_qual, min_len = 1)
+    check_trim_args(min_qual)
+
+    pos_right = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+    pos_left  = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+    pos_left = pos_right if pos_left > pos_right
+
+    self.subseq!(pos_left, pos_right - pos_left)
+  end
+
+  private
+
+  # Method to check the arguments for trimming and raise on bad sequence, qualities,
+  # and min_qual.
+  def check_trim_args(min_qual)
+    raise SeqError, "no sequence"      if self.seq.nil?
+    raise SeqError, "no quality score" if self.qual.nil?
+    unless (SCORE_MIN .. SCORE_MAX).include? min_qual
+      raise SeqError, "minimum quality value: #{min_qual} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
+    end
+  end
+
+  # Inline C functions for speed below.
+  inline do |builder|
+    # Method for locating the right trim position and return this.
+    builder.c %{
+      VALUE trim_right_pos_c(
+        VALUE _qual,        // quality score string
+        VALUE _len,         // length of quality score string
+        VALUE _min_qual,    // minimum quality score
+        VALUE _min_len,     // minimum quality length
+        VALUE _score_base   // score base
+      )
+      { 
+        unsigned char *qual       = StringValuePtr(_qual);
+        unsigned int   len        = FIX2UINT(_len);
+        unsigned int   min_qual   = FIX2UINT(_min_qual);
+        unsigned int   min_len    = FIX2UINT(_min_len);
+        unsigned int   score_base = FIX2UINT(_score_base);
+
+        unsigned int i = 0;
+        unsigned int c = 0;
+
+        while (i < len)
+        {
+          c = 0;
+
+          while ((c < min_len) && ((c + i) < len) && (qual[len - (c + i) - 1] - score_base > min_qual))
+            c++;
+
+          if (c == min_len)
+            return UINT2NUM(len - i);
+          else
+            i += c;
+
+          i++;
+        }
+
+        return UINT2NUM(0);
+      }
+    }
+
+    # Method for locating the left trim position and return this.
+    builder.c %{
+      VALUE trim_left_pos_c(
+        VALUE _qual,        // quality score string
+        VALUE _len,         // length of quality score string
+        VALUE _min_qual,    // minimum quality score
+        VALUE _min_len,     // minimum quality length
+        VALUE _score_base   // score base
+      )
+      { 
+        unsigned char *qual       = StringValuePtr(_qual);
+        unsigned int   len        = FIX2UINT(_len);
+        unsigned int   min_qual   = FIX2UINT(_min_qual);
+        unsigned int   min_len    = FIX2UINT(_min_len);
+        unsigned int   score_base = FIX2UINT(_score_base);
+
+        unsigned int i = 0;
+        unsigned int c = 0;
+
+        while (i < len)
+        {
+          c = 0;
+
+          while ((c < min_len) && ((c + i) < len) && (qual[c + i] - score_base > min_qual))
+            c++;
+
+          if (c == min_len)
+            return UINT2NUM(i);
+          else
+            i += c;
+
+          i++;
+        }
+
+        return UINT2NUM(i);
+      }
+    }
+  end
+end
+
+__END__
diff --git a/code_ruby/test/maasha/seq/test_trim.rb b/code_ruby/test/maasha/seq/test_trim.rb
new file mode 100755 (executable)
index 0000000..e9695be
--- /dev/null
@@ -0,0 +1,147 @@
+#!/usr/bin/env ruby
+
+require 'maasha/seq'
+require 'test/unit'
+require 'pp'
+
+class TestTrim < Test::Unit::TestCase 
+  def setup
+    @entry = Seq.new
+  end
+
+  def test_Trim_quality_trim_with_missing_seq_raises
+    @entry.qual = "hhhh"
+    assert_raise(SeqError) { @entry.quality_trim_right(20) }
+    assert_raise(SeqError) { @entry.quality_trim_right!(20) }
+    assert_raise(SeqError) { @entry.quality_trim_left(20) }
+    assert_raise(SeqError) { @entry.quality_trim_left!(20) }
+    assert_raise(SeqError) { @entry.quality_trim(20) }
+    assert_raise(SeqError) { @entry.quality_trim!(20) }
+  end
+
+  def test_Trim_quality_trim_with_missing_qual_raises
+    @entry.seq = "ATCG"
+    assert_raise(SeqError) { @entry.quality_trim_right(20) }
+    assert_raise(SeqError) { @entry.quality_trim_right!(20) }
+    assert_raise(SeqError) { @entry.quality_trim_left(20) }
+    assert_raise(SeqError) { @entry.quality_trim_left!(20) }
+    assert_raise(SeqError) { @entry.quality_trim(20) }
+    assert_raise(SeqError) { @entry.quality_trim!(20) }
+  end
+
+  def test_Trim_quality_trim_with_bad_min_raises
+    @entry.seq  = "ATCG"
+    @entry.qual = "hhhh"
+
+    [-1, 41].each do |min|
+      assert_raise(SeqError) { @entry.quality_trim_right(min) }
+      assert_raise(SeqError) { @entry.quality_trim_right!(min) }
+      assert_raise(SeqError) { @entry.quality_trim_left(min) }
+      assert_raise(SeqError) { @entry.quality_trim_left!(min) }
+      assert_raise(SeqError) { @entry.quality_trim(min) }
+      assert_raise(SeqError) { @entry.quality_trim!(min) }
+    end
+  end
+
+  def test_Trim_quality_trim_with_ok_min_dont_raise
+    @entry.seq  = "ATCG"
+    @entry.qual = "hhhh"
+
+    [0, 40].each do |min|
+      assert_nothing_raised { @entry.quality_trim_right(min) }
+      assert_nothing_raised { @entry.quality_trim_right!(min) }
+      assert_nothing_raised { @entry.quality_trim_left(min) }
+      assert_nothing_raised { @entry.quality_trim_left!(min) }
+      assert_nothing_raised { @entry.quality_trim(min) }
+      assert_nothing_raised { @entry.quality_trim!(min) }
+    end
+  end
+
+  def test_Trim_quality_trim_right_returns_correctly
+    @entry.seq  = "AAAAATCG"
+    @entry.qual = "hhhhhgfe"
+    new_entry = @entry.quality_trim_right(38)
+    assert_equal("AAAAAT", new_entry.seq) 
+    assert_equal("hhhhhg", new_entry.qual) 
+    assert_equal("AAAAATCG", @entry.seq) 
+    assert_equal("hhhhhgfe", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_right_bang_returns_correctly
+    @entry.seq  = "AAAAATCG"
+    @entry.qual = "hhhhhgfe"
+    @entry.quality_trim_right!(38)
+    assert_equal("AAAAAT", @entry.seq) 
+    assert_equal("hhhhhg", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_right_with_all_low_qual_returns_correctly
+    @entry.seq  = "GCTAAAAA"
+    @entry.qual = "@@@@@@@@"
+    @entry.quality_trim_right!(38)
+    assert_equal("", @entry.seq) 
+    assert_equal("", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_left_returns_correctly
+    @entry.seq  = "GCTAAAAA"
+    @entry.qual = "efghhhhh"
+    new_entry = @entry.quality_trim_left(38)
+    assert_equal("TAAAAA", new_entry.seq) 
+    assert_equal("ghhhhh", new_entry.qual) 
+    assert_equal("GCTAAAAA", @entry.seq) 
+    assert_equal("efghhhhh", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_left_bang_returns_correctly
+    @entry.seq  = "GCTAAAAA"
+    @entry.qual = "efghhhhh"
+    @entry.quality_trim_left!(38)
+    assert_equal("TAAAAA", @entry.seq) 
+    assert_equal("ghhhhh", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_left_with_all_low_qual_returns_correctly
+    @entry.seq  = "GCTAAAAA"
+    @entry.qual = "@@@@@@@@"
+    @entry.quality_trim_left!(38)
+    assert_equal("", @entry.seq) 
+    assert_equal("", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_returns_correctly
+    @entry.seq  = "GCTAAAAAGTG"
+    @entry.qual = "efghhhhhgfe"
+    new_entry = @entry.quality_trim(38)
+    assert_equal("TAAAAAG", new_entry.seq) 
+    assert_equal("ghhhhhg", new_entry.qual) 
+    assert_equal("GCTAAAAAGTG", @entry.seq) 
+    assert_equal("efghhhhhgfe", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_bang_returns_correctly
+    @entry.seq  = "GCTAAAAAGTG"
+    @entry.qual = "efghhhhhgfe"
+    @entry.quality_trim!(38)
+    assert_equal("TAAAAAG", @entry.seq) 
+    assert_equal("ghhhhhg", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_with_min_len_bang_returns_correctly
+    @entry.seq  = "GCTCAAACGTG"
+    @entry.qual = "hdefghgfedh"
+    @entry.quality_trim!(37, 2)
+    assert_equal("CAAAC", @entry.seq) 
+    assert_equal("fghgf", @entry.qual) 
+  end
+
+  def test_Trim_quality_trim_with_all_low_qual_returns_correctly
+    @entry.seq  = "GCTCAAACGTG"
+    @entry.qual = "@@@@@@@@@@@"
+    @entry.quality_trim!(37, 2)
+    assert_equal("", @entry.seq) 
+    assert_equal("", @entry.qual) 
+  end
+end
+
+__END__
index a90c565072503641160ea2c7c2e24fded0e0563b..a8f58ce565d7b0598ee919fa372cfa1991119434 100755 (executable)
@@ -282,11 +282,6 @@ class TestSeq < Test::Unit::TestCase
     assert_raise(SeqError) { @entry.subseq(-1, 1) }
   end
 
-  def test_Seq_subseq_with_length_lt_1_raises
-    @entry.seq = "ATCG"
-    assert_raise(SeqError) { @entry.subseq(0, 0) }
-  end
-
   def test_Seq_subseq_with_start_plus_length_gt_seq_raises
     @entry.seq = "ATCG"
     assert_raise(SeqError) { @entry.subseq(0, 5) }
@@ -323,11 +318,6 @@ class TestSeq < Test::Unit::TestCase
     assert_raise(SeqError) { @entry.subseq!(-1, 1) }
   end
 
-  def test_Seq_subseq_bang_with_length_lt_1_raises
-    @entry.seq = "ATCG"
-    assert_raise(SeqError) { @entry.subseq!(0, 0) }
-  end
-
   def test_Seq_subseq_bang_with_start_plus_length_gt_seq_raises
     @entry.seq = "ATCG"
     assert_raise(SeqError) { @entry.subseq!(0, 5) }
@@ -378,86 +368,6 @@ class TestSeq < Test::Unit::TestCase
     assert_equal("ATCG", @entry.subseq_rand(4).seq)
   end
 
-  def test_Seq_quality_trim_right_with_missing_seq_raises
-    @entry.qual = "hhhh"
-    assert_raise(SeqError) { @entry.quality_trim_right(20) }
-  end
-
-  def test_Seq_quality_trim_right_with_missing_qual_raises
-    @entry.seq = "ATCG"
-    assert_raise(SeqError) { @entry.quality_trim_right(20) }
-  end
-
-  def test_Seq_quality_trim_right_with_bad_min_raises
-    @entry.seq  = "ATCG"
-    @entry.qual = "hhhh"
-
-    [-1, 41].each do |min|
-      assert_raise(SeqError) { @entry.quality_trim_right(min) }
-    end
-  end
-
-  def test_Seq_quality_trim_right_with_ok_min_dont_raise
-    @entry.seq  = "ATCG"
-    @entry.qual = "hhhh"
-
-    [0, 40].each do |min|
-      assert_nothing_raised { @entry.quality_trim_right(min) }
-    end
-  end
-
-  def test_Seq_quality_trim_right_returns_correctly
-    @entry.seq  = "AAAAATCG"
-    @entry.qual = "hhhhhgfe"
-    @entry.quality_trim_right(38)
-    assert_equal("AAAAAT", @entry.seq) 
-    assert_equal("hhhhhg", @entry.qual) 
-  end
-
-  def test_Seq_quality_trim_left_with_missing_seq_raises
-    @entry.qual = "hhhh"
-    assert_raise(SeqError) { @entry.quality_trim_left(20) }
-  end
-
-  def test_Seq_quality_trim_left_with_missing_qual_raises
-    @entry.seq = "ATCG"
-    assert_raise(SeqError) { @entry.quality_trim_left(20) }
-  end
-
-  def test_Seq_quality_trim_left_with_bad_min_raises
-    @entry.seq  = "ATCG"
-    @entry.qual = "hhhh"
-
-    [-1, 41].each do |min|
-      assert_raise(SeqError) { @entry.quality_trim_left(min) }
-    end
-  end
-
-  def test_Seq_quality_trim_left_with_ok_min_dont_raise
-    @entry.seq  = "ATCG"
-    @entry.qual = "hhhh"
-
-    [0, 40].each do |min|
-      assert_nothing_raised { @entry.quality_trim_left(min) }
-    end
-  end
-
-  def test_Seq_quality_trim_left_returns_correctly
-    @entry.seq  = "GCTAAAAA"
-    @entry.qual = "efghhhhh"
-    @entry.quality_trim_left(38)
-    assert_equal("TAAAAA", @entry.seq) 
-    assert_equal("ghhhhh", @entry.qual) 
-  end
-
-  def test_Seq_quality_trim_returns_correctly
-    @entry.seq  = "GCTAAAAAGTG"
-    @entry.qual = "efghhhhhgfe"
-    @entry.quality_trim(38)
-    assert_equal("TAAAAAG", @entry.seq) 
-    assert_equal("ghhhhhg", @entry.qual) 
-  end
-
   def test_Seq_indels_remove_without_qual_returns_correctly
     @entry.seq  = "A-T.CG~CG"
     @entry.qual = nil