# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+require 'inline'
+require 'maasha/seq/ambiguity'
+
+# Class containing methods to assemble two overlapping sequences into a single.
class Assemble
+ extend Ambiguity
+
+ # Class method to assemble two Seq objects.
def self.pair(entry1, entry2, options = {})
assemble = self.new(entry1, entry2, options)
assemble.match
end
+ # Method to initialize an Assembly object.
def initialize(entry1, entry2, options)
@entry1 = entry1
@entry2 = entry2
@options = options
@options[:mismatches_max] ||= 0
@options[:overlap_min] ||= 1
- @options[:overlap_max] ||= entry1.length
- @options[:overlap_max] = [@options[:overlap_max], entry1.length, entry2.length].min
end
+ # Method to locate overlapping matches between two sequences.
def match
- overlap = @options[:overlap_max]
-
- na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte")
- na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte")
+ if @options[:overlap_max]
+ overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min
+ else
+ overlap = [@entry1.length, @entry2.length].min
+ end
while overlap >= @options[:overlap_min]
- hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
+ mismatches_max = (overlap * @options[:mismatches_max] * 0.01).round
+
+ offset1 = @entry2.length - overlap
+ offset2 = 0
- if hamming_dist <= percent2real(overlap, @options[:mismatches_max])
+ if match_C(@entry1.seq, @entry2.seq, offset1, offset2, overlap, mismatches_max)
entry_left = @entry1[0 ... @entry1.length - overlap]
entry_right = @entry2[overlap .. -1]
entry_overlap = @entry1[-1 * overlap .. -1]
end
+ entry_left.seq.downcase!
+ entry_overlap.seq.upcase!
+ entry_right.seq.downcase!
entry_merged = entry_left + entry_overlap + entry_right
- entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
+ entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}"
return entry_merged
end
end
end
- def percent2real(length, percent)
- (length * percent * 0.01).round
- end
-
+ # Method to merge sequence and quality scores in an overlap.
+ # The residue with the highest score at mismatch positions is selected.
+ # The quality scores of the overlap are the mean of the two sequences.
def merge_overlap(entry_overlap1, entry_overlap2)
na_seq = NArray.byte(entry_overlap1.length, 2)
na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, "byte")
merged
end
+
+ inline do |builder|
+ add_ambiguity_macro(builder)
+
+ # C method for determining if two strings of equal length match
+ # given a maximum allowed mismatches and allowing for IUPAC
+ # ambiguity codes. Returns true if match, else false.
+ builder.c %{
+ VALUE match_C(
+ VALUE _string1, // String 1
+ VALUE _string2, // String 2
+ VALUE _offset1, // Offset 1
+ VALUE _offset2, // Offset 2
+ VALUE _length, // String length
+ VALUE _max_mismatch // Maximum mismatches
+ )
+ {
+ char *string1 = StringValuePtr(_string1);
+ char *string2 = StringValuePtr(_string2);
+ unsigned int offset1 = FIX2UINT(_offset1);
+ unsigned int offset2 = FIX2UINT(_offset2);
+ unsigned int length = FIX2UINT(_length);
+ unsigned int max_mismatch = FIX2UINT(_max_mismatch);
+
+ unsigned int max_match = length - max_mismatch;
+ unsigned int match = 0;
+ unsigned int mismatch = 0;
+ unsigned int i = 0;
+
+ for (i = 0; i < length; i++)
+ {
+ if (MATCH(string1[i + offset1], string2[i + offset2]))
+ {
+ match++;
+
+ if (match >= max_match) {
+ return Qtrue;
+ }
+ }
+ else
+ {
+ mismatch++;
+
+ if (mismatch > max_mismatch) {
+ return Qfalse;
+ }
+ }
+ }
+
+ return Qfalse;
+ }
+ }
+ end
end