]> git.donarmstrong.com Git - biopieces.git/blobdiff - code_ruby/lib/maasha/seq/assemble.rb
fixed bug in assemble_pairs
[biopieces.git] / code_ruby / lib / maasha / seq / assemble.rb
index de14b64c1fe080fdcf65c34dd61c6e3dafc9e51f..bab869c98b580a52cb1612b9d21b4add8760de10 100644 (file)
@@ -39,46 +39,83 @@ class Assemble
   def initialize(entry1, entry2, options)
     @entry1  = entry1
     @entry2  = entry2
+    @overlap = 0
+    @offset1 = 0
+    @offset2 = 0
     @options = options
     @options[:mismatches_max] ||= 0
     @options[:overlap_min]    ||= 1
-    @options[:overlap_max]    ||= entry1.length
-    @options[:overlap_max]      = [@options[:overlap_max], entry1.length, entry2.length].min
   end
 
   # Method to locate overlapping matches between two sequences.
   def match
-    overlap = @options[:overlap_max]
-
-    while overlap >= @options[:overlap_min]
-      mismatches_max = (overlap * @options[:mismatches_max] * 0.01).round
+    if @options[:overlap_max]
+      @overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min
+    else
+      @overlap = [@entry1.length, @entry2.length].min
+    end
 
-      if match_C(@entry1.seq, @entry2.seq, @entry1.length - overlap, 0, overlap, mismatches_max)
-        entry_left  = @entry1[0 ... @entry1.length - overlap]
-        entry_right = @entry2[overlap .. -1]
+    diff = @entry1.length - @entry2.length
+    diff = 0 if diff < 0
 
-        if @entry1.qual and @entry2.qual
-          entry_overlap1 = @entry1[-1 * overlap .. -1]
-          entry_overlap2 = @entry2[0 ... overlap]
+    @offset1 = @entry1.length - @overlap - diff
 
-          entry_overlap = merge_overlap(entry_overlap1, entry_overlap2)
-        else
-          entry_overlap = @entry1[-1 * overlap .. -1]
-        end
+    while @overlap >= @options[:overlap_min]
+      mismatches_max = (@overlap * @options[:mismatches_max] * 0.01).round
+     
+      # puts "diff: #{diff}   offset1: #{@offset1}  offset2: #{@offset2}   overlap: #{@overlap}"
 
-        entry_left.seq.downcase!
-        entry_overlap.seq.upcase!
-        entry_right.seq.downcase!
+      if mismatches = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2, @overlap, mismatches_max) and mismatches >= 0
         entry_merged          = entry_left + entry_overlap + entry_right
-        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}"
+        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{@overlap}:hamming=#{mismatches}"
 
         return entry_merged
       end
 
-      overlap -= 1
+      if diff > 0
+        diff -= 1
+      else
+        @overlap -= 1
+      end
+
+      @offset1 += 1
     end
   end
 
+  # Method to extract and downcase the left part of an assembled pair.
+  def entry_left
+    entry = @entry1[0 ... @offset1]
+    entry.seq.downcase!
+    entry
+  end
+
+  # Method to extract and downcase the right part of an assembled pair.
+  def entry_right
+    if @entry1.length > @offset1 + @overlap
+      entry = @entry1[@offset1 + @overlap .. -1]
+    else
+      entry = @entry2[@offset2 + @overlap .. -1]
+    end
+
+    entry.seq.downcase!
+    entry
+  end
+
+  # Method to extract and upcase the overlapping part of an assembled pair.
+  def entry_overlap
+    if @entry1.qual and @entry2.qual
+      entry_overlap1 = @entry1[@offset1 ... @offset1 + @overlap]
+      entry_overlap2 = @entry2[@offset2 ... @offset2 + @overlap]
+
+      entry = merge_overlap(entry_overlap1, entry_overlap2)
+    else
+      entry = @entry1[@offset1 ... @offset1 + @overlap]
+    end
+
+    entry.seq.upcase!
+    entry
+  end
+
   # Method to merge sequence and quality scores in an overlap.
   # The residue with the highest score at mismatch positions is selected.
   # The quality scores of the overlap are the mean of the two sequences.
@@ -106,7 +143,7 @@ class Assemble
 
     # C method for determining if two strings of equal length match
     # given a maximum allowed mismatches and allowing for IUPAC
-    # ambiguity codes. Returns true if match, else false.
+    # ambiguity codes. Returns number of mismatches is true if match, else false.
     builder.c %{
       VALUE match_C(
         VALUE _string1,       // String 1
@@ -136,7 +173,7 @@ class Assemble
             match++;
 
             if (match >= max_match) {
-              return Qtrue;
+              return UINT2NUM(mismatch);
             }
           }
           else
@@ -144,12 +181,12 @@ class Assemble
             mismatch++;
 
             if (mismatch > max_mismatch) {
-              return Qfalse;
+              return INT2NUM(-1);
             }
           }
         }
 
-        return Qfalse;
+        return INT2NUM(-1);
       }
     }
   end