X-Git-Url: https://git.donarmstrong.com/?p=biopieces.git;a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fseq%2Fassemble.rb;h=bab869c98b580a52cb1612b9d21b4add8760de10;hp=c6f528b8ce9800061cd5aabdfada95b1911a5c11;hb=5526cf56fc20602b9be197db8b1a9a44502791ab;hpb=84a109e90b4b51959d0566e07648d350b89730cc diff --git a/code_ruby/lib/maasha/seq/assemble.rb b/code_ruby/lib/maasha/seq/assemble.rb index c6f528b..bab869c 100644 --- a/code_ruby/lib/maasha/seq/assemble.rb +++ b/code_ruby/lib/maasha/seq/assemble.rb @@ -39,6 +39,9 @@ class Assemble def initialize(entry1, entry2, options) @entry1 = entry1 @entry2 = entry2 + @overlap = 0 + @offset1 = 0 + @offset2 = 0 @options = options @options[:mismatches_max] ||= 0 @options[:overlap_min] ||= 1 @@ -47,41 +50,70 @@ class Assemble # Method to locate overlapping matches between two sequences. def match if @options[:overlap_max] - overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min + @overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min else - overlap = [@entry1.length, @entry2.length].min + @overlap = [@entry1.length, @entry2.length].min end - while overlap >= @options[:overlap_min] - mismatches_max = (overlap * @options[:mismatches_max] * 0.01).round - - offset1 = @entry2.length - overlap - offset2 = 0 + diff = @entry1.length - @entry2.length + diff = 0 if diff < 0 - if match_C(@entry1.seq, @entry2.seq, offset1, offset2, overlap, mismatches_max) - entry_left = @entry1[0 ... @entry1.length - overlap] - entry_right = @entry2[overlap .. -1] + @offset1 = @entry1.length - @overlap - diff - if @entry1.qual and @entry2.qual - entry_overlap1 = @entry1[-1 * overlap .. -1] - entry_overlap2 = @entry2[0 ... overlap] + while @overlap >= @options[:overlap_min] + mismatches_max = (@overlap * @options[:mismatches_max] * 0.01).round + + # puts "diff: #{diff} offset1: #{@offset1} offset2: #{@offset2} overlap: #{@overlap}" - entry_overlap = merge_overlap(entry_overlap1, entry_overlap2) - else - entry_overlap = @entry1[-1 * overlap .. -1] - end - - entry_left.seq.downcase! - entry_overlap.seq.upcase! - entry_right.seq.downcase! + if mismatches = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2, @overlap, mismatches_max) and mismatches >= 0 entry_merged = entry_left + entry_overlap + entry_right - entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}" + entry_merged.seq_name = @entry1.seq_name + ":overlap=#{@overlap}:hamming=#{mismatches}" return entry_merged end - overlap -= 1 + if diff > 0 + diff -= 1 + else + @overlap -= 1 + end + + @offset1 += 1 + end + end + + # Method to extract and downcase the left part of an assembled pair. + def entry_left + entry = @entry1[0 ... @offset1] + entry.seq.downcase! + entry + end + + # Method to extract and downcase the right part of an assembled pair. + def entry_right + if @entry1.length > @offset1 + @overlap + entry = @entry1[@offset1 + @overlap .. -1] + else + entry = @entry2[@offset2 + @overlap .. -1] + end + + entry.seq.downcase! + entry + end + + # Method to extract and upcase the overlapping part of an assembled pair. + def entry_overlap + if @entry1.qual and @entry2.qual + entry_overlap1 = @entry1[@offset1 ... @offset1 + @overlap] + entry_overlap2 = @entry2[@offset2 ... @offset2 + @overlap] + + entry = merge_overlap(entry_overlap1, entry_overlap2) + else + entry = @entry1[@offset1 ... @offset1 + @overlap] end + + entry.seq.upcase! + entry end # Method to merge sequence and quality scores in an overlap. @@ -111,7 +143,7 @@ class Assemble # C method for determining if two strings of equal length match # given a maximum allowed mismatches and allowing for IUPAC - # ambiguity codes. Returns true if match, else false. + # ambiguity codes. Returns number of mismatches is true if match, else false. builder.c %{ VALUE match_C( VALUE _string1, // String 1 @@ -141,7 +173,7 @@ class Assemble match++; if (match >= max_match) { - return Qtrue; + return UINT2NUM(mismatch); } } else @@ -149,12 +181,12 @@ class Assemble mismatch++; if (mismatch > max_mismatch) { - return Qfalse; + return INT2NUM(-1); } } } - return Qfalse; + return INT2NUM(-1); } } end