X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fseq%2Fassemble.rb;h=bab869c98b580a52cb1612b9d21b4add8760de10;hb=5526cf56fc20602b9be197db8b1a9a44502791ab;hp=dffbf11ee43f25b1fcf684d72c86d7a95ead62f4;hpb=ee14fa4dfd3f52f487a34e5c43ef6bbaacdcd1cb;p=biopieces.git diff --git a/code_ruby/lib/maasha/seq/assemble.rb b/code_ruby/lib/maasha/seq/assemble.rb index dffbf11..bab869c 100644 --- a/code_ruby/lib/maasha/seq/assemble.rb +++ b/code_ruby/lib/maasha/seq/assemble.rb @@ -22,8 +22,13 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +require 'inline' +require 'maasha/seq/ambiguity' + # Class containing methods to assemble two overlapping sequences into a single. class Assemble + extend Ambiguity + # Class method to assemble two Seq objects. def self.pair(entry1, entry2, options = {}) assemble = self.new(entry1, entry2, options) @@ -34,44 +39,81 @@ class Assemble def initialize(entry1, entry2, options) @entry1 = entry1 @entry2 = entry2 + @overlap = 0 + @offset1 = 0 + @offset2 = 0 @options = options @options[:mismatches_max] ||= 0 @options[:overlap_min] ||= 1 - @options[:overlap_max] ||= entry1.length - @options[:overlap_max] = [@options[:overlap_max], entry1.length, entry2.length].min end - # Method to locate overlapping matche between two sequences. + # Method to locate overlapping matches between two sequences. def match - overlap = @options[:overlap_max] - - na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte") - na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte") - - while overlap >= @options[:overlap_min] - hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true + if @options[:overlap_max] + @overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min + else + @overlap = [@entry1.length, @entry2.length].min + end - if hamming_dist <= (overlap * @options[:mismatches_max] * 0.01).round - entry_left = @entry1[0 ... @entry1.length - overlap] - entry_right = @entry2[overlap .. -1] + diff = @entry1.length - @entry2.length + diff = 0 if diff < 0 - if @entry1.qual and @entry2.qual - entry_overlap1 = @entry1[-1 * overlap .. -1] - entry_overlap2 = @entry2[0 ... overlap] + @offset1 = @entry1.length - @overlap - diff - entry_overlap = merge_overlap(entry_overlap1, entry_overlap2) - else - entry_overlap = @entry1[-1 * overlap .. -1] - end + while @overlap >= @options[:overlap_min] + mismatches_max = (@overlap * @options[:mismatches_max] * 0.01).round + + # puts "diff: #{diff} offset1: #{@offset1} offset2: #{@offset2} overlap: #{@overlap}" + if mismatches = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2, @overlap, mismatches_max) and mismatches >= 0 entry_merged = entry_left + entry_overlap + entry_right - entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}" + entry_merged.seq_name = @entry1.seq_name + ":overlap=#{@overlap}:hamming=#{mismatches}" return entry_merged end - overlap -= 1 + if diff > 0 + diff -= 1 + else + @overlap -= 1 + end + + @offset1 += 1 + end + end + + # Method to extract and downcase the left part of an assembled pair. + def entry_left + entry = @entry1[0 ... @offset1] + entry.seq.downcase! + entry + end + + # Method to extract and downcase the right part of an assembled pair. + def entry_right + if @entry1.length > @offset1 + @overlap + entry = @entry1[@offset1 + @overlap .. -1] + else + entry = @entry2[@offset2 + @overlap .. -1] + end + + entry.seq.downcase! + entry + end + + # Method to extract and upcase the overlapping part of an assembled pair. + def entry_overlap + if @entry1.qual and @entry2.qual + entry_overlap1 = @entry1[@offset1 ... @offset1 + @overlap] + entry_overlap2 = @entry2[@offset2 ... @offset2 + @overlap] + + entry = merge_overlap(entry_overlap1, entry_overlap2) + else + entry = @entry1[@offset1 ... @offset1 + @overlap] end + + entry.seq.upcase! + entry end # Method to merge sequence and quality scores in an overlap. @@ -95,6 +137,59 @@ class Assemble merged end + + inline do |builder| + add_ambiguity_macro(builder) + + # C method for determining if two strings of equal length match + # given a maximum allowed mismatches and allowing for IUPAC + # ambiguity codes. Returns number of mismatches is true if match, else false. + builder.c %{ + VALUE match_C( + VALUE _string1, // String 1 + VALUE _string2, // String 2 + VALUE _offset1, // Offset 1 + VALUE _offset2, // Offset 2 + VALUE _length, // String length + VALUE _max_mismatch // Maximum mismatches + ) + { + char *string1 = StringValuePtr(_string1); + char *string2 = StringValuePtr(_string2); + unsigned int offset1 = FIX2UINT(_offset1); + unsigned int offset2 = FIX2UINT(_offset2); + unsigned int length = FIX2UINT(_length); + unsigned int max_mismatch = FIX2UINT(_max_mismatch); + + unsigned int max_match = length - max_mismatch; + unsigned int match = 0; + unsigned int mismatch = 0; + unsigned int i = 0; + + for (i = 0; i < length; i++) + { + if (MATCH(string1[i + offset1], string2[i + offset2])) + { + match++; + + if (match >= max_match) { + return UINT2NUM(mismatch); + } + } + else + { + mismatch++; + + if (mismatch > max_mismatch) { + return INT2NUM(-1); + } + } + } + + return INT2NUM(-1); + } + } + end end