From 6b7e0532884d683d9908f53da7eb7f6ba0b029c0 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 16 Dec 2010 11:08:25 +0000 Subject: [PATCH] added find_homopolymers biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@1189 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/find_homopolymers | 56 +++++++++++++++++++++++++++++ bp_test/in/find_homopolymers.in | 16 +++++++++ bp_test/out/find_homopolymers.out.1 | 20 +++++++++++ bp_test/test/test_find_homopolymers | 7 ++++ code_ruby/Maasha/lib/seq.rb | 14 ++++++++ code_ruby/Maasha/test/test_seq.rb | 15 ++++++++ 6 files changed, 128 insertions(+) create mode 100755 bp_bin/find_homopolymers create mode 100644 bp_test/in/find_homopolymers.in create mode 100644 bp_test/out/find_homopolymers.out.1 create mode 100755 bp_test/test/test_find_homopolymers diff --git a/bp_bin/find_homopolymers b/bp_bin/find_homopolymers new file mode 100755 index 0000000..097d651 --- /dev/null +++ b/bp_bin/find_homopolymers @@ -0,0 +1,56 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2010 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Find homopolymers in sequences in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +require 'biopieces' +require 'seq' +require 'pp' + +casts = [] + +bp = Biopieces.new + +options = bp.parse(ARGV, casts) + +bp.each_record do |record| + if record.has_key? :SEQ + seq = Seq.new(nil, record[:SEQ]) + + record[:HOMOPOL_MAX] = seq.homopol_max + end + + bp.puts record +end + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/bp_test/in/find_homopolymers.in b/bp_test/in/find_homopolymers.in new file mode 100644 index 0000000..1259c9c --- /dev/null +++ b/bp_test/in/find_homopolymers.in @@ -0,0 +1,16 @@ +SEQ_NAME: test1 +SEQ: attcccggggnnnnn +SEQ_LEN: 15 +--- +SEQ_NAME: test2 +SEQ: nnnnnggggccctta +SEQ_LEN: 15 +--- +SEQ_NAME: test3 +SEQ: a +SEQ_LEN: 1 +--- +SEQ_NAME: test4 +SEQ: aA +SEQ_LEN: 2 +--- diff --git a/bp_test/out/find_homopolymers.out.1 b/bp_test/out/find_homopolymers.out.1 new file mode 100644 index 0000000..fcd0ce9 --- /dev/null +++ b/bp_test/out/find_homopolymers.out.1 @@ -0,0 +1,20 @@ +SEQ_NAME: test1 +SEQ: attcccggggnnnnn +SEQ_LEN: 15 +HOMOPOL_MAX: 5 +--- +SEQ_NAME: test2 +SEQ: nnnnnggggccctta +SEQ_LEN: 15 +HOMOPOL_MAX: 5 +--- +SEQ_NAME: test3 +SEQ: a +SEQ_LEN: 1 +HOMOPOL_MAX: 1 +--- +SEQ_NAME: test4 +SEQ: aA +SEQ_LEN: 2 +HOMOPOL_MAX: 2 +--- diff --git a/bp_test/test/test_find_homopolymers b/bp_test/test/test_find_homopolymers new file mode 100755 index 0000000..61c5709 --- /dev/null +++ b/bp_test/test/test_find_homopolymers @@ -0,0 +1,7 @@ +#!/bin/bash + +source "$BP_DIR/bp_test/lib/test.sh" + +run "$bp -I $in -O $tmp" +assert_no_diff $tmp $out.1 +clean diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index 4c28292..a6f071d 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -162,6 +162,20 @@ class Seq comp end + # Method that returns the length of the longest homopolymeric stretch + # found in a sequence. + def homopol_max + max = 0 + + if self.seq + self.seq.upcase.scan(/[A]+|[T]+|[G]+|[C]+|[N]+/) do |match| + max = [match.size, max].max + end + end + + max + end + # Method that returns the percentage of hard masked residues # or N's in a sequence. def hard_mask diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index de60d4c..a5a2732 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -191,6 +191,21 @@ class TestSeq < Test::Unit::TestCase assert_equal(0, @entry.composition["X"]) end + def test_Seq_homopol_max_returns_0_with_empty_sequence + @entry.seq = "" + assert_equal(0, @entry.homopol_max) + end + + def test_Seq_homopol_max_returns_0_with_nil_sequence + @entry.seq = nil + assert_equal(0, @entry.homopol_max) + end + + def test_Seq_homopol_max_returns_correctly + @entry.seq = "AtTcCcGggGnnNnn" + assert_equal(5, @entry.homopol_max) + end + def test_Seq_hard_mask_returns_correctly @entry.seq = "--AAAANn" assert_equal(33.33, @entry.hard_mask) -- 2.39.5