From ab8587a15ef93d08a989720858bbb5d5785a0262 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Sun, 20 Feb 2011 13:06:22 +0000 Subject: [PATCH] added remove_illumina_adaptor Biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@1277 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/remove_illumina_adaptor | 75 +++++++++++++++++++++++++++++++ code_ruby/Maasha/lib/seq.rb | 17 ++++++- code_ruby/Maasha/test/test_seq.rb | 68 ++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 2 deletions(-) create mode 100755 bp_bin/remove_illumina_adaptor diff --git a/bp_bin/remove_illumina_adaptor b/bp_bin/remove_illumina_adaptor new file mode 100755 index 0000000..ad52cc5 --- /dev/null +++ b/bp_bin/remove_illumina_adaptor @@ -0,0 +1,75 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Remove Illumina adaptors or parts thereof from sequences in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +require 'biopieces' +require 'seq' + +casts = [] +casts << {:long=>'right', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'left', :short=>'l', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} + +bp = Biopieces.new + +options = bp.parse(ARGV, casts) + +bp.each_record do |record| + if record.has_key? :SEQ + entry = Seq.new(record[:SEQ_NAME], record[:SEQ], record[:TYPE], record[:SCORES]) + + if options[:right] + pos_right = entry.adaptor_locate_right(options[:right]) + entry.subseq!(0, pos_right) if pos_right >= 0 + record[:CLIP_ADAPTOR_RIGHT] = pos_right + else + record[:CLIP_ADAPTOR_RIGHT] = -1 + end + + if options[:left] + pos_left = entry.adaptor_locate_left(options[:left]) + entry.subseq!(pos_left + 1) if pos_left >= 0 + record[:CLIP_ADAPTOR_LEFT] = pos_left + else + record[:CLIP_ADAPTOR_LEFT] = -1 + end + + record[:SEQ] = entry.seq + record[:SEQ_LEN] = entry.len + record[:SCORES] = entry.qual + end + + bp.puts record +end + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index de13ec2..19cd1b1 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -223,8 +223,8 @@ class Seq # Method that returns a subsequence of from a given start position # and of a given length. - def subseq(start, length) - raise SeqError, "sunsequence start: #{start} < 0" if start < 0 + def subseq(start, length = self.length - start) + raise SeqError, "subsequence start: #{start} < 0" if start < 0 raise SeqError, "subsequence length: #{length} < 1" if length <= 0 raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length @@ -236,6 +236,19 @@ class Seq Seq.new(self.seq_name, seq, self.type, qual) end + # Method that replaces a sequence with a subsequence from a given start position + # and of a given length. + def subseq!(start, length = self.length - start) + raise SeqError, "subsequence start: #{start} < 0" if start < 0 + raise SeqError, "subsequence length: #{length} < 1" if length <= 0 + raise SeqError, "subsequence start + length > Seq.length: #{start} + #{length} > #{self.length}" if start + length > self.length + + stop = start + length - 1 + + self.seq = self.seq[start .. stop] + self.qual = self.qual[start .. stop] unless self.qual.nil? + end + # Method that returns a subsequence of a given length # beginning at a random position. def subseq_rand(length) diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index 2482422..9296e14 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -213,6 +213,12 @@ class TestSeq < Test::Unit::TestCase assert_equal("CG", @entry.subseq(2, 2).seq) end + def test_Seq_subseq_without_len_returns_correct_sequence + @entry.seq = "ATCG" + assert_equal("ATCG", @entry.subseq(0).seq) + assert_equal("CG", @entry.subseq(2).seq) + end + def test_Seq_subseq_returns_correct_qual @entry.seq = "ATCG" @entry.qual = "abcd" @@ -220,6 +226,68 @@ class TestSeq < Test::Unit::TestCase assert_equal("cd", @entry.subseq(2, 2).qual) end + def test_Seq_subseq_without_len_returns_correct_qual + @entry.seq = "ATCG" + @entry.qual = "abcd" + assert_equal("abcd", @entry.subseq(0).qual) + assert_equal("cd", @entry.subseq(2).qual) + end + + def test_Seq_subseq_bang_with_start_lt_0_raises + @entry.seq = "ATCG" + assert_raise(SeqError) { @entry.subseq!(-1, 1) } + end + + def test_Seq_subseq_bang_with_length_lt_1_raises + @entry.seq = "ATCG" + assert_raise(SeqError) { @entry.subseq!(0, 0) } + end + + def test_Seq_subseq_bang_with_start_plus_length_gt_seq_raises + @entry.seq = "ATCG" + assert_raise(SeqError) { @entry.subseq!(0, 5) } + end + + def test_Seq_subseq_bang_returns_correct_sequence + @entry.seq = "ATCG" + @entry.subseq!(0, 2) + assert_equal("AT", @entry.seq) + @entry.seq = "ATCG" + @entry.subseq!(2, 2) + assert_equal("CG", @entry.seq) + end + + def test_Seq_subseq_bang_without_len_returns_correct_sequence + @entry.seq = "ATCG" + @entry.subseq!(0) + assert_equal("ATCG", @entry.seq) + @entry.seq = "ATCG" + @entry.subseq!(2) + assert_equal("CG", @entry.seq) + end + + def test_Seq_subseq_bang_returns_correct_qual + @entry.seq = "ATCG" + @entry.qual = "abcd" + @entry.subseq!(0, 2) + assert_equal("ab", @entry.qual) + @entry.seq = "ATCG" + @entry.qual = "abcd" + @entry.subseq!(2, 2) + assert_equal("cd", @entry.qual) + end + + def test_Seq_subseq_bang_returns_correct_qual + @entry.seq = "ATCG" + @entry.qual = "abcd" + @entry.subseq!(0) + assert_equal("abcd", @entry.qual) + @entry.seq = "ATCG" + @entry.qual = "abcd" + @entry.subseq!(2) + assert_equal("cd", @entry.qual) + end + def test_Seq_subseq_rand_returns_correct_sequence @entry.seq = "ATCG" assert_equal("ATCG", @entry.subseq_rand(4).seq) -- 2.39.5