From: martinahansen Date: Tue, 12 Mar 2013 18:12:29 +0000 (+0000) Subject: added seq/assemble.rb X-Git-Url: https://git.donarmstrong.com/?p=biopieces.git;a=commitdiff_plain;h=232906e4a88ad2cc9fbe90d12ca6e6553d2badf7 added seq/assemble.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@2133 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/seq/assemble.rb b/code_ruby/lib/maasha/seq/assemble.rb new file mode 100644 index 0000000..e59e178 --- /dev/null +++ b/code_ruby/lib/maasha/seq/assemble.rb @@ -0,0 +1,80 @@ +# Copyright (C) 2007-2013 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +class Assemble + def self.pair(entry1, entry2, options = {}) + assemble = self.new(entry1, entry2, options) + assemble.match + end + + def initialize(entry1, entry2, options) + @entry1 = entry1 + @entry2 = entry2 + @options = options + @options[:mismatches_max] ||= 0 + @options[:overlap_min] ||= 1 + @options[:overlap_max] ||= entry1.length + @options[:overlap_max] = [@options[:overlap_max], entry1.length, entry2.length].min + end + + def match + overlap = @options[:overlap_max] + + na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte") + na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte") + + while overlap >= @options[:overlap_min] + hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true + + if hamming_dist <= percent2real(overlap, @options[:mismatches_max]) + merged = @entry1 + @entry2[overlap .. -1] + + if @entry1.qual and @entry2.qual + qual1 = @entry1.qual[@entry1.length - overlap .. -1] + qual2 = @entry2.qual[0 ... overlap] + + na_qual1 = NArray.to_na(qual1, "byte") + na_qual2 = NArray.to_na(qual2, "byte") + + qual = ((na_qual1 + na_qual2) / 2).to_s + + merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}" + merged.qual[@entry1.length - overlap ... @entry1.length] = qual + end + + return merged + end + + overlap -= 1 + end + end + + def percent2real(length, percent) + (length * percent * 0.01).round + end +end + + +__END__ + diff --git a/code_ruby/test/maasha/seq/test_assemble.rb b/code_ruby/test/maasha/seq/test_assemble.rb new file mode 100755 index 0000000..bdc759c --- /dev/null +++ b/code_ruby/test/maasha/seq/test_assemble.rb @@ -0,0 +1,36 @@ +#!/usr/bin/env ruby +$:.unshift File.join(File.dirname(__FILE__), '..', '..', '..') + +require 'maasha/seq/assemble' +require 'test/unit' +require 'test/helper' +require 'pp' + +class TestAssemble < Test::Unit::TestCase + test "Assemble.pair without overlap returns nil" do + assert_nil(Assemble.pair(Seq.new("test1", "aaaa"), Seq.new("test2", "tttt"))) + end + + test "Assemble.pair with overlap returns the correctly" do + assert_equal("atcgatc", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "gatc")).seq) + assert_equal("atcgat", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "cgat")).seq) + assert_equal("atcga", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "tcga")).seq) + assert_equal("atcg", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "atcg")).seq) + end + + test "Assemble.pair with overlap and overlap_min returns correctly" do + assert_nil(Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "gatc"), :overlap_min => 2)) + assert_equal("atcgat", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "cgat"), :overlap_min => 2).seq) + end + + test "Assemble.pair with overlap and overlap_max returns correctly" do + assert_equal("atcga", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "tcga"), :overlap_max => 3).seq) + assert_nil(Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "atcg"), :overlap_max => 3)) + end + + test "Assemble.pair with overlap returns correct quality" do + assert_equal("?", Assemble.pair(Seq.new("test1", "a", :dna, "I"), Seq.new("test2", "a", :dna, "5")).qual) + assert_equal("GH??43", Assemble.pair(Seq.new("test1", "atcg", :dna, "GHII"), Seq.new("test2", "cgat", :dna, "5543")).qual) + assert_equal("I???5", Assemble.pair(Seq.new("test1", "atcg", :dna, "IIII"), Seq.new("test2", "tcga", :dna, "5555")).qual) + end +end diff --git a/code_ruby/test/maasha/test_seq.rb b/code_ruby/test/maasha/test_seq.rb index a6a5cd7..0c84720 100755 --- a/code_ruby/test/maasha/test_seq.rb +++ b/code_ruby/test/maasha/test_seq.rb @@ -333,6 +333,22 @@ class TestSeq < Test::Unit::TestCase assert_not_equal(@entry.seq, @entry.shuffle!.seq) end + test "#+ without qual returns correctly" do + entry = Seq.new("test1", "at") + Seq.new("test2", "cg") + assert_nil(entry.seq_name) + assert_equal("atcg", entry.seq) + assert_nil(entry.type) + assert_nil(entry.qual) + end + + test "#+ with qual returns correctly" do + entry = Seq.new("test1", "at", :dna, "II") + Seq.new("test2", "cg", :dna, "JJ") + assert_nil(entry.seq_name) + assert_equal("atcg", entry.seq) + assert_equal(:dna, entry.type) + assert_equal("IIJJ", entry.qual) + end + test "#<< with different types raises" do @entry.seq = "atcg" assert_raise(SeqError) { @entry << Seq.new("test", "atcg", :dna) }