From 740804b81930af277b29c64f8b8276daff633f44 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 4 Jun 2013 14:00:16 +0000 Subject: [PATCH] revamped find_homopolymers git-svn-id: http://biopieces.googlecode.com/svn/trunk@2174 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/find_homopolymers | 47 +++++++++++++++++-- bp_test/out/find_homopolymers.out.1 | 72 +++++++++++++++++++++++++++-- bp_test/out/find_homopolymers.out.2 | 38 +++++++++++++-- bp_test/test/test_find_homopolymers | 8 ++++ code_ruby/lib/maasha/seq.rb | 22 ++------- code_ruby/test/maasha/test_seq.rb | 20 -------- 6 files changed, 155 insertions(+), 52 deletions(-) diff --git a/bp_bin/find_homopolymers b/bp_bin/find_homopolymers index 89b8e8c..c0885e3 100755 --- a/bp_bin/find_homopolymers +++ b/bp_bin/find_homopolymers @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -# Copyright (C) 2007-2010 Martin A. Hansen. +# Copyright (C) 2007-2013 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -31,9 +31,14 @@ require 'maasha/biopieces' require 'maasha/seq' +require 'pp' + +class Seq; include Homopolymer; end casts = [] -casts << {:long=>'min', :short=>'m', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'min', :short=>'m', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'limit', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'longest', :short=>'L', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) @@ -42,10 +47,42 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| if record[:SEQ] seq = Seq.new(nil, record[:SEQ]) - record[:HOMOPOL_MAX] = seq.homopol_max(options[:min]) + longest = Seq::Homopolymer.new("", 0, 0) + got_one = false + count = 0 + + seq.each_homopolymer(options[:min]) do |h| + got_one = true + + record[:HOMOPOL_PAT] = h.pattern + record[:HOMOPOL_LEN] = h.length + record[:HOMOPOL_POS] = h.pos + + if options[:longest] + longest = h.length > longest.length ? h : longest + else + output.puts record + + count += 1 + + break if options[:limit] and options[:limit] == count + end + end + + if options[:longest] + if longest.length > options[:min] + record[:HOMOPOL_PAT] = longest.pattern + record[:HOMOPOL_LEN] = longest.length + record[:HOMOPOL_POS] = longest.pos + end + + output.puts record + elsif not got_one + output.puts record + end + else + output.puts record end - - output.puts record end end diff --git a/bp_test/out/find_homopolymers.out.1 b/bp_test/out/find_homopolymers.out.1 index fcd0ce9..105c6a7 100644 --- a/bp_test/out/find_homopolymers.out.1 +++ b/bp_test/out/find_homopolymers.out.1 @@ -1,20 +1,84 @@ SEQ_NAME: test1 SEQ: attcccggggnnnnn SEQ_LEN: 15 -HOMOPOL_MAX: 5 +HOMOPOL_PAT: A +HOMOPOL_LEN: 1 +HOMOPOL_POS: 0 +--- +SEQ_NAME: test1 +SEQ: attcccggggnnnnn +SEQ_LEN: 15 +HOMOPOL_PAT: TT +HOMOPOL_LEN: 2 +HOMOPOL_POS: 1 +--- +SEQ_NAME: test1 +SEQ: attcccggggnnnnn +SEQ_LEN: 15 +HOMOPOL_PAT: CCC +HOMOPOL_LEN: 3 +HOMOPOL_POS: 3 +--- +SEQ_NAME: test1 +SEQ: attcccggggnnnnn +SEQ_LEN: 15 +HOMOPOL_PAT: GGGG +HOMOPOL_LEN: 4 +HOMOPOL_POS: 6 +--- +SEQ_NAME: test1 +SEQ: attcccggggnnnnn +SEQ_LEN: 15 +HOMOPOL_PAT: NNNNN +HOMOPOL_LEN: 5 +HOMOPOL_POS: 10 +--- +SEQ_NAME: test2 +SEQ: nnnnnggggccctta +SEQ_LEN: 15 +HOMOPOL_PAT: NNNNN +HOMOPOL_LEN: 5 +HOMOPOL_POS: 0 +--- +SEQ_NAME: test2 +SEQ: nnnnnggggccctta +SEQ_LEN: 15 +HOMOPOL_PAT: GGGG +HOMOPOL_LEN: 4 +HOMOPOL_POS: 5 +--- +SEQ_NAME: test2 +SEQ: nnnnnggggccctta +SEQ_LEN: 15 +HOMOPOL_PAT: CCC +HOMOPOL_LEN: 3 +HOMOPOL_POS: 9 +--- +SEQ_NAME: test2 +SEQ: nnnnnggggccctta +SEQ_LEN: 15 +HOMOPOL_PAT: TT +HOMOPOL_LEN: 2 +HOMOPOL_POS: 12 --- SEQ_NAME: test2 SEQ: nnnnnggggccctta SEQ_LEN: 15 -HOMOPOL_MAX: 5 +HOMOPOL_PAT: A +HOMOPOL_LEN: 1 +HOMOPOL_POS: 14 --- SEQ_NAME: test3 SEQ: a SEQ_LEN: 1 -HOMOPOL_MAX: 1 +HOMOPOL_PAT: A +HOMOPOL_LEN: 1 +HOMOPOL_POS: 0 --- SEQ_NAME: test4 SEQ: aA SEQ_LEN: 2 -HOMOPOL_MAX: 2 +HOMOPOL_PAT: AA +HOMOPOL_LEN: 2 +HOMOPOL_POS: 0 --- diff --git a/bp_test/out/find_homopolymers.out.2 b/bp_test/out/find_homopolymers.out.2 index 9213941..5eeb22c 100644 --- a/bp_test/out/find_homopolymers.out.2 +++ b/bp_test/out/find_homopolymers.out.2 @@ -1,20 +1,50 @@ SEQ_NAME: test1 SEQ: attcccggggnnnnn SEQ_LEN: 15 -HOMOPOL_MAX: 5 +HOMOPOL_PAT: CCC +HOMOPOL_LEN: 3 +HOMOPOL_POS: 3 +--- +SEQ_NAME: test1 +SEQ: attcccggggnnnnn +SEQ_LEN: 15 +HOMOPOL_PAT: GGGG +HOMOPOL_LEN: 4 +HOMOPOL_POS: 6 +--- +SEQ_NAME: test1 +SEQ: attcccggggnnnnn +SEQ_LEN: 15 +HOMOPOL_PAT: NNNNN +HOMOPOL_LEN: 5 +HOMOPOL_POS: 10 +--- +SEQ_NAME: test2 +SEQ: nnnnnggggccctta +SEQ_LEN: 15 +HOMOPOL_PAT: NNNNN +HOMOPOL_LEN: 5 +HOMOPOL_POS: 0 +--- +SEQ_NAME: test2 +SEQ: nnnnnggggccctta +SEQ_LEN: 15 +HOMOPOL_PAT: GGGG +HOMOPOL_LEN: 4 +HOMOPOL_POS: 5 --- SEQ_NAME: test2 SEQ: nnnnnggggccctta SEQ_LEN: 15 -HOMOPOL_MAX: 5 +HOMOPOL_PAT: CCC +HOMOPOL_LEN: 3 +HOMOPOL_POS: 9 --- SEQ_NAME: test3 SEQ: a SEQ_LEN: 1 -HOMOPOL_MAX: 0 --- SEQ_NAME: test4 SEQ: aA SEQ_LEN: 2 -HOMOPOL_MAX: 0 --- diff --git a/bp_test/test/test_find_homopolymers b/bp_test/test/test_find_homopolymers index 939c6f4..591a6d2 100755 --- a/bp_test/test/test_find_homopolymers +++ b/bp_test/test/test_find_homopolymers @@ -9,3 +9,11 @@ clean run "$bp -I $in -m 3 -O $tmp" assert_no_diff $tmp $out.2 clean + +run "$bp -I $in -m 3 -l 1 -O $tmp" +assert_no_diff $tmp $out.3 +clean + +run "$bp -I $in -m 3 -L -O $tmp" +assert_no_diff $tmp $out.4 +clean diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index d8a73d3..2788e5d 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -27,8 +27,9 @@ require 'maasha/seq/digest' require 'maasha/seq/trim' require 'narray' -autoload :BackTrack, 'maasha/seq/backtrack.rb' -autoload :Dynamic, 'maasha/seq/dynamic.rb' +autoload :BackTrack, 'maasha/seq/backtrack.rb' +autoload :Dynamic, 'maasha/seq/dynamic.rb' +autoload :Homopolymer, 'maasha/seq/homopolymer.rb' # Residue alphabets DNA = %w[a t c g] @@ -515,23 +516,6 @@ class Seq comp end - # Method that returns the length of the longest homopolymeric stretch - # found in a sequence. - def homopol_max(min = 1) - return 0 if self.seq.nil? or self.seq.empty? - - found = false - - self.seq.upcase.scan(/A{#{min},}|T{#{min},}|G{#{min},}|C{#{min},}|N{#{min},}/) do |match| - found = true - min = match.size > min ? match.size : min - end - - return 0 unless found - - min - end - # Method that returns the percentage of hard masked residues # or N's in a sequence. def hard_mask diff --git a/code_ruby/test/maasha/test_seq.rb b/code_ruby/test/maasha/test_seq.rb index 0c84720..8eed8d5 100755 --- a/code_ruby/test/maasha/test_seq.rb +++ b/code_ruby/test/maasha/test_seq.rb @@ -546,26 +546,6 @@ class TestSeq < Test::Unit::TestCase assert_equal(0, @entry.composition["X"]) end - test "#homopol_max returns 0 with empty sequence" do - @entry.seq = "" - assert_equal(0, @entry.homopol_max) - end - - test "#homopol_max returns 0 with nil sequence" do - @entry.seq = nil - assert_equal(0, @entry.homopol_max) - end - - test "#homopol_max returns 0 when not found" do - @entry.seq = "AtTcCcGggGnnNnn" - assert_equal(0, @entry.homopol_max(6)) - end - - test "#homopol_max returns correctly" do - @entry.seq = "AtTcCcGggGnnNnn" - assert_equal(5, @entry.homopol_max(3)) - end - test "#hard_mask returns correctly" do @entry.seq = "--AAAANn" assert_equal(33.33, @entry.hard_mask) -- 2.39.2