]> git.donarmstrong.com Git - biopieces.git/commitdiff
upgraded assemble.rb to chose proper residue when mismatch
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 13 Mar 2013 09:12:52 +0000 (09:12 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 13 Mar 2013 09:12:52 +0000 (09:12 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@2137 74ccb610-7750-0410-82ae-013aeee3265d

code_ruby/lib/maasha/seq/assemble.rb
code_ruby/test/maasha/seq/test_assemble.rb

index e59e178b73df8ba45689836db02c852d5632f637..7745b5bed7141bd1ad4d500c71b9ac585b2ed0b2 100644 (file)
@@ -48,22 +48,22 @@ class Assemble
       hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
 
       if hamming_dist <= percent2real(overlap, @options[:mismatches_max])
-        merged = @entry1 + @entry2[overlap .. -1]
+        entry_left  = @entry1[0 ... @entry1.length - overlap]
+        entry_right = @entry2[overlap .. -1]
 
         if @entry1.qual and @entry2.qual
-          qual1 = @entry1.qual[@entry1.length - overlap .. -1]
-          qual2 = @entry2.qual[0 ... overlap]
+          entry_overlap1 = @entry1[-1 * overlap .. -1]
+          entry_overlap2 = @entry2[0 ... overlap]
 
-          na_qual1 = NArray.to_na(qual1, "byte")
-          na_qual2 = NArray.to_na(qual2, "byte")
-
-          qual = ((na_qual1 + na_qual2) / 2).to_s
-
-          merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
-          merged.qual[@entry1.length - overlap ... @entry1.length]  = qual
+          entry_overlap = merge_overlap(entry_overlap1, entry_overlap2)
+        else
+          entry_overlap = @entry1[-1 * overlap .. -1]
         end
 
-        return merged
+        entry_merged          = entry_left + entry_overlap + entry_right
+        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
+
+        return entry_merged
       end
 
       overlap -= 1
@@ -71,7 +71,26 @@ class Assemble
   end
 
   def percent2real(length, percent)
-      (length * percent * 0.01).round
+    (length * percent * 0.01).round
+  end
+
+  def merge_overlap(entry_overlap1, entry_overlap2)
+    na_seq = NArray.byte(entry_overlap1.length, 2)
+    na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, "byte")
+    na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, "byte")
+
+    na_qual = NArray.byte(entry_overlap1.length, 2)
+    na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, "byte")
+    na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, "byte")
+
+    mask_xor = na_seq[true, 0] ^ na_seq[true, 1] > 0
+    mask_seq = ((na_qual * mask_xor).eq( (na_qual * mask_xor).max(1)))
+
+    merged      = Seq.new()
+    merged.seq  = (na_seq * mask_seq).max(1).to_s
+    merged.qual = na_qual.mean(1).round.to_type("byte").to_s
+
+    merged
   end
 end
 
index bdc759c3297a269493509a21419a27a249a29d32..cf05ca839fca4fb8bcdb0521b0d62b4e9eea6b83 100755 (executable)
@@ -33,4 +33,9 @@ class TestAssemble < Test::Unit::TestCase
     assert_equal("GH??43", Assemble.pair(Seq.new("test1", "atcg", :dna, "GHII"), Seq.new("test2", "cgat", :dna, "5543")).qual)
     assert_equal("I???5", Assemble.pair(Seq.new("test1", "atcg", :dna, "IIII"), Seq.new("test2", "tcga", :dna, "5555")).qual)
   end
+
+  test "Assemble.pair with mismatch returns the highest scoring" do
+    assert_equal("atcga", Assemble.pair(Seq.new("t1", "atcga", :dna, "IIIII"), Seq.new("t2", "attga", :dna, "55555"), :mismatches_max => 20).seq)
+    assert_equal("attga", Assemble.pair(Seq.new("t1", "atcga", :dna, "55555"), Seq.new("t2", "attga", :dna, "IIIII"), :mismatches_max => 20).seq)
+  end
 end