]> git.donarmstrong.com Git - biopieces.git/commitdiff
added seq/assemble.rb
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 12 Mar 2013 18:12:29 +0000 (18:12 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 12 Mar 2013 18:12:29 +0000 (18:12 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@2133 74ccb610-7750-0410-82ae-013aeee3265d

code_ruby/lib/maasha/seq/assemble.rb [new file with mode: 0644]
code_ruby/test/maasha/seq/test_assemble.rb [new file with mode: 0755]
code_ruby/test/maasha/test_seq.rb

diff --git a/code_ruby/lib/maasha/seq/assemble.rb b/code_ruby/lib/maasha/seq/assemble.rb
new file mode 100644 (file)
index 0000000..e59e178
--- /dev/null
@@ -0,0 +1,80 @@
+# Copyright (C) 2007-2013 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+class Assemble
+  def self.pair(entry1, entry2, options = {})
+    assemble = self.new(entry1, entry2, options)
+    assemble.match
+  end
+
+  def initialize(entry1, entry2, options)
+    @entry1  = entry1
+    @entry2  = entry2
+    @options = options
+    @options[:mismatches_max] ||= 0
+    @options[:overlap_min]    ||= 1
+    @options[:overlap_max]    ||= entry1.length
+    @options[:overlap_max]      = [@options[:overlap_max], entry1.length, entry2.length].min
+  end
+
+  def match
+    overlap = @options[:overlap_max]
+
+    na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte")
+    na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte")
+
+    while overlap >= @options[:overlap_min]
+      hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
+
+      if hamming_dist <= percent2real(overlap, @options[:mismatches_max])
+        merged = @entry1 + @entry2[overlap .. -1]
+
+        if @entry1.qual and @entry2.qual
+          qual1 = @entry1.qual[@entry1.length - overlap .. -1]
+          qual2 = @entry2.qual[0 ... overlap]
+
+          na_qual1 = NArray.to_na(qual1, "byte")
+          na_qual2 = NArray.to_na(qual2, "byte")
+
+          qual = ((na_qual1 + na_qual2) / 2).to_s
+
+          merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
+          merged.qual[@entry1.length - overlap ... @entry1.length]  = qual
+        end
+
+        return merged
+      end
+
+      overlap -= 1
+    end
+  end
+
+  def percent2real(length, percent)
+      (length * percent * 0.01).round
+  end
+end
+
+
+__END__
+
diff --git a/code_ruby/test/maasha/seq/test_assemble.rb b/code_ruby/test/maasha/seq/test_assemble.rb
new file mode 100755 (executable)
index 0000000..bdc759c
--- /dev/null
@@ -0,0 +1,36 @@
+#!/usr/bin/env ruby
+$:.unshift File.join(File.dirname(__FILE__), '..', '..', '..')
+
+require 'maasha/seq/assemble'
+require 'test/unit'
+require 'test/helper'
+require 'pp'
+
+class TestAssemble < Test::Unit::TestCase 
+  test "Assemble.pair without overlap returns nil" do
+    assert_nil(Assemble.pair(Seq.new("test1", "aaaa"), Seq.new("test2", "tttt")))
+  end
+
+  test "Assemble.pair with overlap returns the correctly" do
+    assert_equal("atcgatc", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "gatc")).seq)
+    assert_equal("atcgat",  Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "cgat")).seq)
+    assert_equal("atcga",   Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "tcga")).seq)
+    assert_equal("atcg",    Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "atcg")).seq)
+  end
+
+  test "Assemble.pair with overlap and overlap_min returns correctly" do
+    assert_nil(Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "gatc"), :overlap_min => 2))
+    assert_equal("atcgat", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "cgat"), :overlap_min => 2).seq)
+  end
+
+  test "Assemble.pair with overlap and overlap_max returns correctly" do
+    assert_equal("atcga", Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "tcga"), :overlap_max => 3).seq)
+    assert_nil(Assemble.pair(Seq.new("test1", "atcg"), Seq.new("test2", "atcg"), :overlap_max => 3))
+  end
+
+  test "Assemble.pair with overlap returns correct quality" do
+    assert_equal("?", Assemble.pair(Seq.new("test1", "a", :dna, "I"), Seq.new("test2", "a", :dna, "5")).qual)
+    assert_equal("GH??43", Assemble.pair(Seq.new("test1", "atcg", :dna, "GHII"), Seq.new("test2", "cgat", :dna, "5543")).qual)
+    assert_equal("I???5", Assemble.pair(Seq.new("test1", "atcg", :dna, "IIII"), Seq.new("test2", "tcga", :dna, "5555")).qual)
+  end
+end
index a6a5cd7f8a0d482561b1445cae4a8857cea80700..0c847205cb6bbd06ec7df4cbb6602a654cd22cf0 100755 (executable)
@@ -333,6 +333,22 @@ class TestSeq < Test::Unit::TestCase
     assert_not_equal(@entry.seq, @entry.shuffle!.seq)
   end
 
+  test "#+ without qual returns correctly" do
+    entry = Seq.new("test1", "at") + Seq.new("test2", "cg")
+    assert_nil(entry.seq_name)
+    assert_equal("atcg", entry.seq)
+    assert_nil(entry.type)
+    assert_nil(entry.qual)
+  end
+
+  test "#+ with qual returns correctly" do
+    entry = Seq.new("test1", "at", :dna, "II") + Seq.new("test2", "cg", :dna, "JJ")
+    assert_nil(entry.seq_name)
+    assert_equal("atcg", entry.seq)
+    assert_equal(:dna,   entry.type)
+    assert_equal("IIJJ", entry.qual)
+  end
+
   test "#<< with different types raises" do
     @entry.seq = "atcg"
     assert_raise(SeqError) { @entry << Seq.new("test", "atcg", :dna) }