hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
if hamming_dist <= percent2real(overlap, @options[:mismatches_max])
- merged = @entry1 + @entry2[overlap .. -1]
+ entry_left = @entry1[0 ... @entry1.length - overlap]
+ entry_right = @entry2[overlap .. -1]
if @entry1.qual and @entry2.qual
- qual1 = @entry1.qual[@entry1.length - overlap .. -1]
- qual2 = @entry2.qual[0 ... overlap]
+ entry_overlap1 = @entry1[-1 * overlap .. -1]
+ entry_overlap2 = @entry2[0 ... overlap]
- na_qual1 = NArray.to_na(qual1, "byte")
- na_qual2 = NArray.to_na(qual2, "byte")
-
- qual = ((na_qual1 + na_qual2) / 2).to_s
-
- merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
- merged.qual[@entry1.length - overlap ... @entry1.length] = qual
+ entry_overlap = merge_overlap(entry_overlap1, entry_overlap2)
+ else
+ entry_overlap = @entry1[-1 * overlap .. -1]
end
- return merged
+ entry_merged = entry_left + entry_overlap + entry_right
+ entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
+
+ return entry_merged
end
overlap -= 1
end
def percent2real(length, percent)
- (length * percent * 0.01).round
+ (length * percent * 0.01).round
+ end
+
+ def merge_overlap(entry_overlap1, entry_overlap2)
+ na_seq = NArray.byte(entry_overlap1.length, 2)
+ na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, "byte")
+ na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, "byte")
+
+ na_qual = NArray.byte(entry_overlap1.length, 2)
+ na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, "byte")
+ na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, "byte")
+
+ mask_xor = na_seq[true, 0] ^ na_seq[true, 1] > 0
+ mask_seq = ((na_qual * mask_xor).eq( (na_qual * mask_xor).max(1)))
+
+ merged = Seq.new()
+ merged.seq = (na_seq * mask_seq).max(1).to_s
+ merged.qual = na_qual.mean(1).round.to_type("byte").to_s
+
+ merged
end
end
assert_equal("GH??43", Assemble.pair(Seq.new("test1", "atcg", :dna, "GHII"), Seq.new("test2", "cgat", :dna, "5543")).qual)
assert_equal("I???5", Assemble.pair(Seq.new("test1", "atcg", :dna, "IIII"), Seq.new("test2", "tcga", :dna, "5555")).qual)
end
+
+ test "Assemble.pair with mismatch returns the highest scoring" do
+ assert_equal("atcga", Assemble.pair(Seq.new("t1", "atcga", :dna, "IIIII"), Seq.new("t2", "attga", :dna, "55555"), :mismatches_max => 20).seq)
+ assert_equal("attga", Assemble.pair(Seq.new("t1", "atcga", :dna, "55555"), Seq.new("t2", "attga", :dna, "IIIII"), :mismatches_max => 20).seq)
+ end
end