From: martinahansen Date: Wed, 13 Mar 2013 09:12:52 +0000 (+0000) Subject: upgraded assemble.rb to chose proper residue when mismatch X-Git-Url: https://git.donarmstrong.com/?p=biopieces.git;a=commitdiff_plain;h=8baaff296bcb2be5030906d02a36e5a27cb9d064 upgraded assemble.rb to chose proper residue when mismatch git-svn-id: http://biopieces.googlecode.com/svn/trunk@2137 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/seq/assemble.rb b/code_ruby/lib/maasha/seq/assemble.rb index e59e178..7745b5b 100644 --- a/code_ruby/lib/maasha/seq/assemble.rb +++ b/code_ruby/lib/maasha/seq/assemble.rb @@ -48,22 +48,22 @@ class Assemble hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true if hamming_dist <= percent2real(overlap, @options[:mismatches_max]) - merged = @entry1 + @entry2[overlap .. -1] + entry_left = @entry1[0 ... @entry1.length - overlap] + entry_right = @entry2[overlap .. -1] if @entry1.qual and @entry2.qual - qual1 = @entry1.qual[@entry1.length - overlap .. -1] - qual2 = @entry2.qual[0 ... overlap] + entry_overlap1 = @entry1[-1 * overlap .. -1] + entry_overlap2 = @entry2[0 ... overlap] - na_qual1 = NArray.to_na(qual1, "byte") - na_qual2 = NArray.to_na(qual2, "byte") - - qual = ((na_qual1 + na_qual2) / 2).to_s - - merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}" - merged.qual[@entry1.length - overlap ... @entry1.length] = qual + entry_overlap = merge_overlap(entry_overlap1, entry_overlap2) + else + entry_overlap = @entry1[-1 * overlap .. -1] end - return merged + entry_merged = entry_left + entry_overlap + entry_right + entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}" + + return entry_merged end overlap -= 1 @@ -71,7 +71,26 @@ class Assemble end def percent2real(length, percent) - (length * percent * 0.01).round + (length * percent * 0.01).round + end + + def merge_overlap(entry_overlap1, entry_overlap2) + na_seq = NArray.byte(entry_overlap1.length, 2) + na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, "byte") + na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, "byte") + + na_qual = NArray.byte(entry_overlap1.length, 2) + na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, "byte") + na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, "byte") + + mask_xor = na_seq[true, 0] ^ na_seq[true, 1] > 0 + mask_seq = ((na_qual * mask_xor).eq( (na_qual * mask_xor).max(1))) + + merged = Seq.new() + merged.seq = (na_seq * mask_seq).max(1).to_s + merged.qual = na_qual.mean(1).round.to_type("byte").to_s + + merged end end diff --git a/code_ruby/test/maasha/seq/test_assemble.rb b/code_ruby/test/maasha/seq/test_assemble.rb index bdc759c..cf05ca8 100755 --- a/code_ruby/test/maasha/seq/test_assemble.rb +++ b/code_ruby/test/maasha/seq/test_assemble.rb @@ -33,4 +33,9 @@ class TestAssemble < Test::Unit::TestCase assert_equal("GH??43", Assemble.pair(Seq.new("test1", "atcg", :dna, "GHII"), Seq.new("test2", "cgat", :dna, "5543")).qual) assert_equal("I???5", Assemble.pair(Seq.new("test1", "atcg", :dna, "IIII"), Seq.new("test2", "tcga", :dna, "5555")).qual) end + + test "Assemble.pair with mismatch returns the highest scoring" do + assert_equal("atcga", Assemble.pair(Seq.new("t1", "atcga", :dna, "IIIII"), Seq.new("t2", "attga", :dna, "55555"), :mismatches_max => 20).seq) + assert_equal("attga", Assemble.pair(Seq.new("t1", "atcga", :dna, "55555"), Seq.new("t2", "attga", :dna, "IIIII"), :mismatches_max => 20).seq) + end end