From 6bbf0485db2edf04a3074f8cdd3ec747b469df7f Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 23 May 2012 14:14:54 +0000 Subject: [PATCH] rewrite of seq.rb and addition of seq/trim.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1821 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/Rakefile | 6 +- code_ruby/lib/maasha/seq.rb | 66 ++++----- code_ruby/lib/maasha/seq/trim.rb | 187 +++++++++++++++++++++++++ code_ruby/test/maasha/seq/test_trim.rb | 147 +++++++++++++++++++ code_ruby/test/maasha/test_seq.rb | 90 ------------ 5 files changed, 362 insertions(+), 134 deletions(-) create mode 100644 code_ruby/lib/maasha/seq/trim.rb create mode 100755 code_ruby/test/maasha/seq/test_trim.rb diff --git a/code_ruby/Rakefile b/code_ruby/Rakefile index 17437ab..d685516 100644 --- a/code_ruby/Rakefile +++ b/code_ruby/Rakefile @@ -1,6 +1,8 @@ require 'rake/testtask' -Rake::TestTask.new("test") do |t| - t.pattern = "test/maasha/test_*.rb" +task :default => "test" + +Rake::TestTask.new do |t| + t.test_files = FileList["test/**/test_*.rb"] t.warning = true end diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index 2382218..0dcccba 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -26,6 +26,7 @@ require 'maasha/patternmatcher' require 'maasha/bits' require 'maasha/backtrack' require 'maasha/digest' +require 'maasha/seq/trim' require 'narray' #require 'maasha/patscan' @@ -48,6 +49,7 @@ class Seq include PatternMatcher include BackTrack include Digest + include Trim attr_accessor :seq_name, :seq, :type, :qual @@ -319,13 +321,20 @@ class Seq # and of a given length. def subseq(start, length = self.length - start) raise SeqError, "subsequence start: #{start} < 0" if start < 0 - raise SeqError, "subsequence length: #{length} < 1" if length <= 0 + raise SeqError, "subsequence length: #{length} < 0" if length < 0 raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length - stop = start + length - 1 - seq = self.seq[start .. stop] - qual = self.qual[start .. stop] unless self.qual.nil? + if length == 0 + seq = "" + qual = "" unless self.qual.nil? + else + stop = start + length - 1 + + seq = self.seq[start .. stop] + qual = self.qual[start .. stop] unless self.qual.nil? + end + Seq.new(self.seq_name, seq, self.type, qual) end @@ -334,13 +343,20 @@ class Seq # and of a given length. def subseq!(start, length = self.length - start) raise SeqError, "subsequence start: #{start} < 0" if start < 0 - raise SeqError, "subsequence length: #{length} < 1" if length <= 0 + raise SeqError, "subsequence length: #{length} < 0" if length < 0 raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length - stop = start + length - 1 + if length == 0 + self.seq = "" + self.qual = "" unless self.qual.nil? + else + stop = start + length - 1 + + self.seq = self.seq[start .. stop] + self.qual = self.qual[start .. stop] unless self.qual.nil? + end - self.seq = self.seq[start .. stop] - self.qual = self.qual[start .. stop] unless self.qual.nil? + self end # Method that returns a subsequence of a given length @@ -355,40 +371,6 @@ class Seq self.subseq(start, length) end - def quality_trim_right(min) - raise SeqError, "no sequence" if self.seq.nil? - raise SeqError, "no quality score" if self.qual.nil? - raise SeqError, "minimum value: #{min} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? min - - regex_right = Regexp.new("[#{(SCORE_BASE).chr}-#{(SCORE_BASE + min).chr}]+$") - - self.qual.match(regex_right) do |m| - self.subseq!(0, $`.length) if $`.length > 0 - end - - self - end - - def quality_trim_left(min) - raise SeqError, "no sequence" if self.seq.nil? - raise SeqError, "no quality score" if self.qual.nil? - raise SeqError, "minimum value: #{min} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? min - - regex_left = Regexp.new("^[#{(SCORE_BASE).chr}-#{(SCORE_BASE + min).chr}]+") - - self.qual.match(regex_left) do |m| - self.subseq!(m.to_s.length, self.length - m.to_s.length) if self.length - m.to_s.length > 0 - end - - self - end - - def quality_trim(min) - self.quality_trim_right(min) - self.quality_trim_left(min) - self - end - # Method that returns the residue compositions of a sequence in # a hash where the key is the residue and the value is the residue # count. diff --git a/code_ruby/lib/maasha/seq/trim.rb b/code_ruby/lib/maasha/seq/trim.rb new file mode 100644 index 0000000..09dff89 --- /dev/null +++ b/code_ruby/lib/maasha/seq/trim.rb @@ -0,0 +1,187 @@ + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'inline' + +# Module containing methods for end trimming sequences with suboptimal quality +# scores. +module Trim + # Method to progressively trim a Seq object sequence from the right end until + # a run of min_len residues with quality scores above min_qual is encountered. + def quality_trim_right(min_qual, min_len = 1) + check_trim_args(min_qual) + + pos = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE) + + self.subseq(0, pos) + end + + # Method to progressively trim a Seq object sequence from the right end until + # a run of min_len residues with quality scores above min_qual is encountered. + def quality_trim_right!(min_qual, min_len = 1) + check_trim_args(min_qual) + + pos = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE) + + self.subseq!(0, pos) + end + + # Method to progressively trim a Seq object sequence from the left end until + # a run of min_len residues with quality scores above min_qual is encountered. + def quality_trim_left(min_qual, min_len = 1) + check_trim_args(min_qual) + + pos = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE) + + self.subseq(pos) + end + + # Method to progressively trim a Seq object sequence from the left end until + # a run of min_len residues with quality scores above min_qual is encountered. + def quality_trim_left!(min_qual, min_len = 1) + check_trim_args(min_qual) + + pos = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE) + + self.subseq!(pos) + end + + # Method to progressively trim a Seq object sequence from both ends until a + # run of min_len residues with quality scores above min_qual is encountered. + def quality_trim(min_qual, min_len = 1) + check_trim_args(min_qual) + + pos_right = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE) + pos_left = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE) + + pos_left = pos_right if pos_left > pos_right + + self.subseq(pos_left, pos_right - pos_left) + end + + # Method to progressively trim a Seq object sequence from both ends until a + # run of min_len residues with quality scores above min_qual is encountered. + def quality_trim!(min_qual, min_len = 1) + check_trim_args(min_qual) + + pos_right = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE) + pos_left = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE) + + pos_left = pos_right if pos_left > pos_right + + self.subseq!(pos_left, pos_right - pos_left) + end + + private + + # Method to check the arguments for trimming and raise on bad sequence, qualities, + # and min_qual. + def check_trim_args(min_qual) + raise SeqError, "no sequence" if self.seq.nil? + raise SeqError, "no quality score" if self.qual.nil? + unless (SCORE_MIN .. SCORE_MAX).include? min_qual + raise SeqError, "minimum quality value: #{min_qual} out of range #{SCORE_MIN} .. #{SCORE_MAX}" + end + end + + # Inline C functions for speed below. + inline do |builder| + # Method for locating the right trim position and return this. + builder.c %{ + VALUE trim_right_pos_c( + VALUE _qual, // quality score string + VALUE _len, // length of quality score string + VALUE _min_qual, // minimum quality score + VALUE _min_len, // minimum quality length + VALUE _score_base // score base + ) + { + unsigned char *qual = StringValuePtr(_qual); + unsigned int len = FIX2UINT(_len); + unsigned int min_qual = FIX2UINT(_min_qual); + unsigned int min_len = FIX2UINT(_min_len); + unsigned int score_base = FIX2UINT(_score_base); + + unsigned int i = 0; + unsigned int c = 0; + + while (i < len) + { + c = 0; + + while ((c < min_len) && ((c + i) < len) && (qual[len - (c + i) - 1] - score_base > min_qual)) + c++; + + if (c == min_len) + return UINT2NUM(len - i); + else + i += c; + + i++; + } + + return UINT2NUM(0); + } + } + + # Method for locating the left trim position and return this. + builder.c %{ + VALUE trim_left_pos_c( + VALUE _qual, // quality score string + VALUE _len, // length of quality score string + VALUE _min_qual, // minimum quality score + VALUE _min_len, // minimum quality length + VALUE _score_base // score base + ) + { + unsigned char *qual = StringValuePtr(_qual); + unsigned int len = FIX2UINT(_len); + unsigned int min_qual = FIX2UINT(_min_qual); + unsigned int min_len = FIX2UINT(_min_len); + unsigned int score_base = FIX2UINT(_score_base); + + unsigned int i = 0; + unsigned int c = 0; + + while (i < len) + { + c = 0; + + while ((c < min_len) && ((c + i) < len) && (qual[c + i] - score_base > min_qual)) + c++; + + if (c == min_len) + return UINT2NUM(i); + else + i += c; + + i++; + } + + return UINT2NUM(i); + } + } + end +end + +__END__ diff --git a/code_ruby/test/maasha/seq/test_trim.rb b/code_ruby/test/maasha/seq/test_trim.rb new file mode 100755 index 0000000..e9695be --- /dev/null +++ b/code_ruby/test/maasha/seq/test_trim.rb @@ -0,0 +1,147 @@ +#!/usr/bin/env ruby + +require 'maasha/seq' +require 'test/unit' +require 'pp' + +class TestTrim < Test::Unit::TestCase + def setup + @entry = Seq.new + end + + def test_Trim_quality_trim_with_missing_seq_raises + @entry.qual = "hhhh" + assert_raise(SeqError) { @entry.quality_trim_right(20) } + assert_raise(SeqError) { @entry.quality_trim_right!(20) } + assert_raise(SeqError) { @entry.quality_trim_left(20) } + assert_raise(SeqError) { @entry.quality_trim_left!(20) } + assert_raise(SeqError) { @entry.quality_trim(20) } + assert_raise(SeqError) { @entry.quality_trim!(20) } + end + + def test_Trim_quality_trim_with_missing_qual_raises + @entry.seq = "ATCG" + assert_raise(SeqError) { @entry.quality_trim_right(20) } + assert_raise(SeqError) { @entry.quality_trim_right!(20) } + assert_raise(SeqError) { @entry.quality_trim_left(20) } + assert_raise(SeqError) { @entry.quality_trim_left!(20) } + assert_raise(SeqError) { @entry.quality_trim(20) } + assert_raise(SeqError) { @entry.quality_trim!(20) } + end + + def test_Trim_quality_trim_with_bad_min_raises + @entry.seq = "ATCG" + @entry.qual = "hhhh" + + [-1, 41].each do |min| + assert_raise(SeqError) { @entry.quality_trim_right(min) } + assert_raise(SeqError) { @entry.quality_trim_right!(min) } + assert_raise(SeqError) { @entry.quality_trim_left(min) } + assert_raise(SeqError) { @entry.quality_trim_left!(min) } + assert_raise(SeqError) { @entry.quality_trim(min) } + assert_raise(SeqError) { @entry.quality_trim!(min) } + end + end + + def test_Trim_quality_trim_with_ok_min_dont_raise + @entry.seq = "ATCG" + @entry.qual = "hhhh" + + [0, 40].each do |min| + assert_nothing_raised { @entry.quality_trim_right(min) } + assert_nothing_raised { @entry.quality_trim_right!(min) } + assert_nothing_raised { @entry.quality_trim_left(min) } + assert_nothing_raised { @entry.quality_trim_left!(min) } + assert_nothing_raised { @entry.quality_trim(min) } + assert_nothing_raised { @entry.quality_trim!(min) } + end + end + + def test_Trim_quality_trim_right_returns_correctly + @entry.seq = "AAAAATCG" + @entry.qual = "hhhhhgfe" + new_entry = @entry.quality_trim_right(38) + assert_equal("AAAAAT", new_entry.seq) + assert_equal("hhhhhg", new_entry.qual) + assert_equal("AAAAATCG", @entry.seq) + assert_equal("hhhhhgfe", @entry.qual) + end + + def test_Trim_quality_trim_right_bang_returns_correctly + @entry.seq = "AAAAATCG" + @entry.qual = "hhhhhgfe" + @entry.quality_trim_right!(38) + assert_equal("AAAAAT", @entry.seq) + assert_equal("hhhhhg", @entry.qual) + end + + def test_Trim_quality_trim_right_with_all_low_qual_returns_correctly + @entry.seq = "GCTAAAAA" + @entry.qual = "@@@@@@@@" + @entry.quality_trim_right!(38) + assert_equal("", @entry.seq) + assert_equal("", @entry.qual) + end + + def test_Trim_quality_trim_left_returns_correctly + @entry.seq = "GCTAAAAA" + @entry.qual = "efghhhhh" + new_entry = @entry.quality_trim_left(38) + assert_equal("TAAAAA", new_entry.seq) + assert_equal("ghhhhh", new_entry.qual) + assert_equal("GCTAAAAA", @entry.seq) + assert_equal("efghhhhh", @entry.qual) + end + + def test_Trim_quality_trim_left_bang_returns_correctly + @entry.seq = "GCTAAAAA" + @entry.qual = "efghhhhh" + @entry.quality_trim_left!(38) + assert_equal("TAAAAA", @entry.seq) + assert_equal("ghhhhh", @entry.qual) + end + + def test_Trim_quality_trim_left_with_all_low_qual_returns_correctly + @entry.seq = "GCTAAAAA" + @entry.qual = "@@@@@@@@" + @entry.quality_trim_left!(38) + assert_equal("", @entry.seq) + assert_equal("", @entry.qual) + end + + def test_Trim_quality_trim_returns_correctly + @entry.seq = "GCTAAAAAGTG" + @entry.qual = "efghhhhhgfe" + new_entry = @entry.quality_trim(38) + assert_equal("TAAAAAG", new_entry.seq) + assert_equal("ghhhhhg", new_entry.qual) + assert_equal("GCTAAAAAGTG", @entry.seq) + assert_equal("efghhhhhgfe", @entry.qual) + end + + def test_Trim_quality_trim_bang_returns_correctly + @entry.seq = "GCTAAAAAGTG" + @entry.qual = "efghhhhhgfe" + @entry.quality_trim!(38) + assert_equal("TAAAAAG", @entry.seq) + assert_equal("ghhhhhg", @entry.qual) + end + + def test_Trim_quality_trim_with_min_len_bang_returns_correctly + @entry.seq = "GCTCAAACGTG" + @entry.qual = "hdefghgfedh" + @entry.quality_trim!(37, 2) + assert_equal("CAAAC", @entry.seq) + assert_equal("fghgf", @entry.qual) + end + + def test_Trim_quality_trim_with_all_low_qual_returns_correctly + @entry.seq = "GCTCAAACGTG" + @entry.qual = "@@@@@@@@@@@" + @entry.quality_trim!(37, 2) + assert_equal("", @entry.seq) + assert_equal("", @entry.qual) + end +end + +__END__ diff --git a/code_ruby/test/maasha/test_seq.rb b/code_ruby/test/maasha/test_seq.rb index a90c565..a8f58ce 100755 --- a/code_ruby/test/maasha/test_seq.rb +++ b/code_ruby/test/maasha/test_seq.rb @@ -282,11 +282,6 @@ class TestSeq < Test::Unit::TestCase assert_raise(SeqError) { @entry.subseq(-1, 1) } end - def test_Seq_subseq_with_length_lt_1_raises - @entry.seq = "ATCG" - assert_raise(SeqError) { @entry.subseq(0, 0) } - end - def test_Seq_subseq_with_start_plus_length_gt_seq_raises @entry.seq = "ATCG" assert_raise(SeqError) { @entry.subseq(0, 5) } @@ -323,11 +318,6 @@ class TestSeq < Test::Unit::TestCase assert_raise(SeqError) { @entry.subseq!(-1, 1) } end - def test_Seq_subseq_bang_with_length_lt_1_raises - @entry.seq = "ATCG" - assert_raise(SeqError) { @entry.subseq!(0, 0) } - end - def test_Seq_subseq_bang_with_start_plus_length_gt_seq_raises @entry.seq = "ATCG" assert_raise(SeqError) { @entry.subseq!(0, 5) } @@ -378,86 +368,6 @@ class TestSeq < Test::Unit::TestCase assert_equal("ATCG", @entry.subseq_rand(4).seq) end - def test_Seq_quality_trim_right_with_missing_seq_raises - @entry.qual = "hhhh" - assert_raise(SeqError) { @entry.quality_trim_right(20) } - end - - def test_Seq_quality_trim_right_with_missing_qual_raises - @entry.seq = "ATCG" - assert_raise(SeqError) { @entry.quality_trim_right(20) } - end - - def test_Seq_quality_trim_right_with_bad_min_raises - @entry.seq = "ATCG" - @entry.qual = "hhhh" - - [-1, 41].each do |min| - assert_raise(SeqError) { @entry.quality_trim_right(min) } - end - end - - def test_Seq_quality_trim_right_with_ok_min_dont_raise - @entry.seq = "ATCG" - @entry.qual = "hhhh" - - [0, 40].each do |min| - assert_nothing_raised { @entry.quality_trim_right(min) } - end - end - - def test_Seq_quality_trim_right_returns_correctly - @entry.seq = "AAAAATCG" - @entry.qual = "hhhhhgfe" - @entry.quality_trim_right(38) - assert_equal("AAAAAT", @entry.seq) - assert_equal("hhhhhg", @entry.qual) - end - - def test_Seq_quality_trim_left_with_missing_seq_raises - @entry.qual = "hhhh" - assert_raise(SeqError) { @entry.quality_trim_left(20) } - end - - def test_Seq_quality_trim_left_with_missing_qual_raises - @entry.seq = "ATCG" - assert_raise(SeqError) { @entry.quality_trim_left(20) } - end - - def test_Seq_quality_trim_left_with_bad_min_raises - @entry.seq = "ATCG" - @entry.qual = "hhhh" - - [-1, 41].each do |min| - assert_raise(SeqError) { @entry.quality_trim_left(min) } - end - end - - def test_Seq_quality_trim_left_with_ok_min_dont_raise - @entry.seq = "ATCG" - @entry.qual = "hhhh" - - [0, 40].each do |min| - assert_nothing_raised { @entry.quality_trim_left(min) } - end - end - - def test_Seq_quality_trim_left_returns_correctly - @entry.seq = "GCTAAAAA" - @entry.qual = "efghhhhh" - @entry.quality_trim_left(38) - assert_equal("TAAAAA", @entry.seq) - assert_equal("ghhhhh", @entry.qual) - end - - def test_Seq_quality_trim_returns_correctly - @entry.seq = "GCTAAAAAGTG" - @entry.qual = "efghhhhhgfe" - @entry.quality_trim(38) - assert_equal("TAAAAAG", @entry.seq) - assert_equal("ghhhhhg", @entry.qual) - end - def test_Seq_indels_remove_without_qual_returns_correctly @entry.seq = "A-T.CG~CG" @entry.qual = nil -- 2.39.2