require 'rake/testtask'
-Rake::TestTask.new("test") do |t|
- t.pattern = "test/maasha/test_*.rb"
+task :default => "test"
+
+Rake::TestTask.new do |t|
+ t.test_files = FileList["test/**/test_*.rb"]
t.warning = true
end
require 'maasha/bits'
require 'maasha/backtrack'
require 'maasha/digest'
+require 'maasha/seq/trim'
require 'narray'
#require 'maasha/patscan'
include PatternMatcher
include BackTrack
include Digest
+ include Trim
attr_accessor :seq_name, :seq, :type, :qual
# and of a given length.
def subseq(start, length = self.length - start)
raise SeqError, "subsequence start: #{start} < 0" if start < 0
- raise SeqError, "subsequence length: #{length} < 1" if length <= 0
+ raise SeqError, "subsequence length: #{length} < 0" if length < 0
raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length
- stop = start + length - 1
- seq = self.seq[start .. stop]
- qual = self.qual[start .. stop] unless self.qual.nil?
+ if length == 0
+ seq = ""
+ qual = "" unless self.qual.nil?
+ else
+ stop = start + length - 1
+
+ seq = self.seq[start .. stop]
+ qual = self.qual[start .. stop] unless self.qual.nil?
+ end
+
Seq.new(self.seq_name, seq, self.type, qual)
end
# and of a given length.
def subseq!(start, length = self.length - start)
raise SeqError, "subsequence start: #{start} < 0" if start < 0
- raise SeqError, "subsequence length: #{length} < 1" if length <= 0
+ raise SeqError, "subsequence length: #{length} < 0" if length < 0
raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length
- stop = start + length - 1
+ if length == 0
+ self.seq = ""
+ self.qual = "" unless self.qual.nil?
+ else
+ stop = start + length - 1
+
+ self.seq = self.seq[start .. stop]
+ self.qual = self.qual[start .. stop] unless self.qual.nil?
+ end
- self.seq = self.seq[start .. stop]
- self.qual = self.qual[start .. stop] unless self.qual.nil?
+ self
end
# Method that returns a subsequence of a given length
self.subseq(start, length)
end
- def quality_trim_right(min)
- raise SeqError, "no sequence" if self.seq.nil?
- raise SeqError, "no quality score" if self.qual.nil?
- raise SeqError, "minimum value: #{min} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? min
-
- regex_right = Regexp.new("[#{(SCORE_BASE).chr}-#{(SCORE_BASE + min).chr}]+$")
-
- self.qual.match(regex_right) do |m|
- self.subseq!(0, $`.length) if $`.length > 0
- end
-
- self
- end
-
- def quality_trim_left(min)
- raise SeqError, "no sequence" if self.seq.nil?
- raise SeqError, "no quality score" if self.qual.nil?
- raise SeqError, "minimum value: #{min} out of range #{SCORE_MIN} .. #{SCORE_MAX}" unless (SCORE_MIN .. SCORE_MAX).include? min
-
- regex_left = Regexp.new("^[#{(SCORE_BASE).chr}-#{(SCORE_BASE + min).chr}]+")
-
- self.qual.match(regex_left) do |m|
- self.subseq!(m.to_s.length, self.length - m.to_s.length) if self.length - m.to_s.length > 0
- end
-
- self
- end
-
- def quality_trim(min)
- self.quality_trim_right(min)
- self.quality_trim_left(min)
- self
- end
-
# Method that returns the residue compositions of a sequence in
# a hash where the key is the residue and the value is the residue
# count.
--- /dev/null
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'inline'
+
+# Module containing methods for end trimming sequences with suboptimal quality
+# scores.
+module Trim
+ # Method to progressively trim a Seq object sequence from the right end until
+ # a run of min_len residues with quality scores above min_qual is encountered.
+ def quality_trim_right(min_qual, min_len = 1)
+ check_trim_args(min_qual)
+
+ pos = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+ self.subseq(0, pos)
+ end
+
+ # Method to progressively trim a Seq object sequence from the right end until
+ # a run of min_len residues with quality scores above min_qual is encountered.
+ def quality_trim_right!(min_qual, min_len = 1)
+ check_trim_args(min_qual)
+
+ pos = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+ self.subseq!(0, pos)
+ end
+
+ # Method to progressively trim a Seq object sequence from the left end until
+ # a run of min_len residues with quality scores above min_qual is encountered.
+ def quality_trim_left(min_qual, min_len = 1)
+ check_trim_args(min_qual)
+
+ pos = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+ self.subseq(pos)
+ end
+
+ # Method to progressively trim a Seq object sequence from the left end until
+ # a run of min_len residues with quality scores above min_qual is encountered.
+ def quality_trim_left!(min_qual, min_len = 1)
+ check_trim_args(min_qual)
+
+ pos = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+ self.subseq!(pos)
+ end
+
+ # Method to progressively trim a Seq object sequence from both ends until a
+ # run of min_len residues with quality scores above min_qual is encountered.
+ def quality_trim(min_qual, min_len = 1)
+ check_trim_args(min_qual)
+
+ pos_right = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+ pos_left = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+ pos_left = pos_right if pos_left > pos_right
+
+ self.subseq(pos_left, pos_right - pos_left)
+ end
+
+ # Method to progressively trim a Seq object sequence from both ends until a
+ # run of min_len residues with quality scores above min_qual is encountered.
+ def quality_trim!(min_qual, min_len = 1)
+ check_trim_args(min_qual)
+
+ pos_right = trim_right_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+ pos_left = trim_left_pos_c(self.qual, self.length, min_qual, min_len, SCORE_BASE)
+
+ pos_left = pos_right if pos_left > pos_right
+
+ self.subseq!(pos_left, pos_right - pos_left)
+ end
+
+ private
+
+ # Method to check the arguments for trimming and raise on bad sequence, qualities,
+ # and min_qual.
+ def check_trim_args(min_qual)
+ raise SeqError, "no sequence" if self.seq.nil?
+ raise SeqError, "no quality score" if self.qual.nil?
+ unless (SCORE_MIN .. SCORE_MAX).include? min_qual
+ raise SeqError, "minimum quality value: #{min_qual} out of range #{SCORE_MIN} .. #{SCORE_MAX}"
+ end
+ end
+
+ # Inline C functions for speed below.
+ inline do |builder|
+ # Method for locating the right trim position and return this.
+ builder.c %{
+ VALUE trim_right_pos_c(
+ VALUE _qual, // quality score string
+ VALUE _len, // length of quality score string
+ VALUE _min_qual, // minimum quality score
+ VALUE _min_len, // minimum quality length
+ VALUE _score_base // score base
+ )
+ {
+ unsigned char *qual = StringValuePtr(_qual);
+ unsigned int len = FIX2UINT(_len);
+ unsigned int min_qual = FIX2UINT(_min_qual);
+ unsigned int min_len = FIX2UINT(_min_len);
+ unsigned int score_base = FIX2UINT(_score_base);
+
+ unsigned int i = 0;
+ unsigned int c = 0;
+
+ while (i < len)
+ {
+ c = 0;
+
+ while ((c < min_len) && ((c + i) < len) && (qual[len - (c + i) - 1] - score_base > min_qual))
+ c++;
+
+ if (c == min_len)
+ return UINT2NUM(len - i);
+ else
+ i += c;
+
+ i++;
+ }
+
+ return UINT2NUM(0);
+ }
+ }
+
+ # Method for locating the left trim position and return this.
+ builder.c %{
+ VALUE trim_left_pos_c(
+ VALUE _qual, // quality score string
+ VALUE _len, // length of quality score string
+ VALUE _min_qual, // minimum quality score
+ VALUE _min_len, // minimum quality length
+ VALUE _score_base // score base
+ )
+ {
+ unsigned char *qual = StringValuePtr(_qual);
+ unsigned int len = FIX2UINT(_len);
+ unsigned int min_qual = FIX2UINT(_min_qual);
+ unsigned int min_len = FIX2UINT(_min_len);
+ unsigned int score_base = FIX2UINT(_score_base);
+
+ unsigned int i = 0;
+ unsigned int c = 0;
+
+ while (i < len)
+ {
+ c = 0;
+
+ while ((c < min_len) && ((c + i) < len) && (qual[c + i] - score_base > min_qual))
+ c++;
+
+ if (c == min_len)
+ return UINT2NUM(i);
+ else
+ i += c;
+
+ i++;
+ }
+
+ return UINT2NUM(i);
+ }
+ }
+ end
+end
+
+__END__
--- /dev/null
+#!/usr/bin/env ruby
+
+require 'maasha/seq'
+require 'test/unit'
+require 'pp'
+
+class TestTrim < Test::Unit::TestCase
+ def setup
+ @entry = Seq.new
+ end
+
+ def test_Trim_quality_trim_with_missing_seq_raises
+ @entry.qual = "hhhh"
+ assert_raise(SeqError) { @entry.quality_trim_right(20) }
+ assert_raise(SeqError) { @entry.quality_trim_right!(20) }
+ assert_raise(SeqError) { @entry.quality_trim_left(20) }
+ assert_raise(SeqError) { @entry.quality_trim_left!(20) }
+ assert_raise(SeqError) { @entry.quality_trim(20) }
+ assert_raise(SeqError) { @entry.quality_trim!(20) }
+ end
+
+ def test_Trim_quality_trim_with_missing_qual_raises
+ @entry.seq = "ATCG"
+ assert_raise(SeqError) { @entry.quality_trim_right(20) }
+ assert_raise(SeqError) { @entry.quality_trim_right!(20) }
+ assert_raise(SeqError) { @entry.quality_trim_left(20) }
+ assert_raise(SeqError) { @entry.quality_trim_left!(20) }
+ assert_raise(SeqError) { @entry.quality_trim(20) }
+ assert_raise(SeqError) { @entry.quality_trim!(20) }
+ end
+
+ def test_Trim_quality_trim_with_bad_min_raises
+ @entry.seq = "ATCG"
+ @entry.qual = "hhhh"
+
+ [-1, 41].each do |min|
+ assert_raise(SeqError) { @entry.quality_trim_right(min) }
+ assert_raise(SeqError) { @entry.quality_trim_right!(min) }
+ assert_raise(SeqError) { @entry.quality_trim_left(min) }
+ assert_raise(SeqError) { @entry.quality_trim_left!(min) }
+ assert_raise(SeqError) { @entry.quality_trim(min) }
+ assert_raise(SeqError) { @entry.quality_trim!(min) }
+ end
+ end
+
+ def test_Trim_quality_trim_with_ok_min_dont_raise
+ @entry.seq = "ATCG"
+ @entry.qual = "hhhh"
+
+ [0, 40].each do |min|
+ assert_nothing_raised { @entry.quality_trim_right(min) }
+ assert_nothing_raised { @entry.quality_trim_right!(min) }
+ assert_nothing_raised { @entry.quality_trim_left(min) }
+ assert_nothing_raised { @entry.quality_trim_left!(min) }
+ assert_nothing_raised { @entry.quality_trim(min) }
+ assert_nothing_raised { @entry.quality_trim!(min) }
+ end
+ end
+
+ def test_Trim_quality_trim_right_returns_correctly
+ @entry.seq = "AAAAATCG"
+ @entry.qual = "hhhhhgfe"
+ new_entry = @entry.quality_trim_right(38)
+ assert_equal("AAAAAT", new_entry.seq)
+ assert_equal("hhhhhg", new_entry.qual)
+ assert_equal("AAAAATCG", @entry.seq)
+ assert_equal("hhhhhgfe", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_right_bang_returns_correctly
+ @entry.seq = "AAAAATCG"
+ @entry.qual = "hhhhhgfe"
+ @entry.quality_trim_right!(38)
+ assert_equal("AAAAAT", @entry.seq)
+ assert_equal("hhhhhg", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_right_with_all_low_qual_returns_correctly
+ @entry.seq = "GCTAAAAA"
+ @entry.qual = "@@@@@@@@"
+ @entry.quality_trim_right!(38)
+ assert_equal("", @entry.seq)
+ assert_equal("", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_left_returns_correctly
+ @entry.seq = "GCTAAAAA"
+ @entry.qual = "efghhhhh"
+ new_entry = @entry.quality_trim_left(38)
+ assert_equal("TAAAAA", new_entry.seq)
+ assert_equal("ghhhhh", new_entry.qual)
+ assert_equal("GCTAAAAA", @entry.seq)
+ assert_equal("efghhhhh", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_left_bang_returns_correctly
+ @entry.seq = "GCTAAAAA"
+ @entry.qual = "efghhhhh"
+ @entry.quality_trim_left!(38)
+ assert_equal("TAAAAA", @entry.seq)
+ assert_equal("ghhhhh", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_left_with_all_low_qual_returns_correctly
+ @entry.seq = "GCTAAAAA"
+ @entry.qual = "@@@@@@@@"
+ @entry.quality_trim_left!(38)
+ assert_equal("", @entry.seq)
+ assert_equal("", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_returns_correctly
+ @entry.seq = "GCTAAAAAGTG"
+ @entry.qual = "efghhhhhgfe"
+ new_entry = @entry.quality_trim(38)
+ assert_equal("TAAAAAG", new_entry.seq)
+ assert_equal("ghhhhhg", new_entry.qual)
+ assert_equal("GCTAAAAAGTG", @entry.seq)
+ assert_equal("efghhhhhgfe", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_bang_returns_correctly
+ @entry.seq = "GCTAAAAAGTG"
+ @entry.qual = "efghhhhhgfe"
+ @entry.quality_trim!(38)
+ assert_equal("TAAAAAG", @entry.seq)
+ assert_equal("ghhhhhg", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_with_min_len_bang_returns_correctly
+ @entry.seq = "GCTCAAACGTG"
+ @entry.qual = "hdefghgfedh"
+ @entry.quality_trim!(37, 2)
+ assert_equal("CAAAC", @entry.seq)
+ assert_equal("fghgf", @entry.qual)
+ end
+
+ def test_Trim_quality_trim_with_all_low_qual_returns_correctly
+ @entry.seq = "GCTCAAACGTG"
+ @entry.qual = "@@@@@@@@@@@"
+ @entry.quality_trim!(37, 2)
+ assert_equal("", @entry.seq)
+ assert_equal("", @entry.qual)
+ end
+end
+
+__END__
assert_raise(SeqError) { @entry.subseq(-1, 1) }
end
- def test_Seq_subseq_with_length_lt_1_raises
- @entry.seq = "ATCG"
- assert_raise(SeqError) { @entry.subseq(0, 0) }
- end
-
def test_Seq_subseq_with_start_plus_length_gt_seq_raises
@entry.seq = "ATCG"
assert_raise(SeqError) { @entry.subseq(0, 5) }
assert_raise(SeqError) { @entry.subseq!(-1, 1) }
end
- def test_Seq_subseq_bang_with_length_lt_1_raises
- @entry.seq = "ATCG"
- assert_raise(SeqError) { @entry.subseq!(0, 0) }
- end
-
def test_Seq_subseq_bang_with_start_plus_length_gt_seq_raises
@entry.seq = "ATCG"
assert_raise(SeqError) { @entry.subseq!(0, 5) }
assert_equal("ATCG", @entry.subseq_rand(4).seq)
end
- def test_Seq_quality_trim_right_with_missing_seq_raises
- @entry.qual = "hhhh"
- assert_raise(SeqError) { @entry.quality_trim_right(20) }
- end
-
- def test_Seq_quality_trim_right_with_missing_qual_raises
- @entry.seq = "ATCG"
- assert_raise(SeqError) { @entry.quality_trim_right(20) }
- end
-
- def test_Seq_quality_trim_right_with_bad_min_raises
- @entry.seq = "ATCG"
- @entry.qual = "hhhh"
-
- [-1, 41].each do |min|
- assert_raise(SeqError) { @entry.quality_trim_right(min) }
- end
- end
-
- def test_Seq_quality_trim_right_with_ok_min_dont_raise
- @entry.seq = "ATCG"
- @entry.qual = "hhhh"
-
- [0, 40].each do |min|
- assert_nothing_raised { @entry.quality_trim_right(min) }
- end
- end
-
- def test_Seq_quality_trim_right_returns_correctly
- @entry.seq = "AAAAATCG"
- @entry.qual = "hhhhhgfe"
- @entry.quality_trim_right(38)
- assert_equal("AAAAAT", @entry.seq)
- assert_equal("hhhhhg", @entry.qual)
- end
-
- def test_Seq_quality_trim_left_with_missing_seq_raises
- @entry.qual = "hhhh"
- assert_raise(SeqError) { @entry.quality_trim_left(20) }
- end
-
- def test_Seq_quality_trim_left_with_missing_qual_raises
- @entry.seq = "ATCG"
- assert_raise(SeqError) { @entry.quality_trim_left(20) }
- end
-
- def test_Seq_quality_trim_left_with_bad_min_raises
- @entry.seq = "ATCG"
- @entry.qual = "hhhh"
-
- [-1, 41].each do |min|
- assert_raise(SeqError) { @entry.quality_trim_left(min) }
- end
- end
-
- def test_Seq_quality_trim_left_with_ok_min_dont_raise
- @entry.seq = "ATCG"
- @entry.qual = "hhhh"
-
- [0, 40].each do |min|
- assert_nothing_raised { @entry.quality_trim_left(min) }
- end
- end
-
- def test_Seq_quality_trim_left_returns_correctly
- @entry.seq = "GCTAAAAA"
- @entry.qual = "efghhhhh"
- @entry.quality_trim_left(38)
- assert_equal("TAAAAA", @entry.seq)
- assert_equal("ghhhhh", @entry.qual)
- end
-
- def test_Seq_quality_trim_returns_correctly
- @entry.seq = "GCTAAAAAGTG"
- @entry.qual = "efghhhhhgfe"
- @entry.quality_trim(38)
- assert_equal("TAAAAAG", @entry.seq)
- assert_equal("ghhhhhg", @entry.qual)
- end
-
def test_Seq_indels_remove_without_qual_returns_correctly
@entry.seq = "A-T.CG~CG"
@entry.qual = nil