]> git.donarmstrong.com Git - biopieces.git/blobdiff - code_ruby/lib/maasha/seq/assemble.rb
refactoring of assemble_pairs
[biopieces.git] / code_ruby / lib / maasha / seq / assemble.rb
index 016322727cf15eb824f9e3f0518cca9b2ecb0241..de14b64c1fe080fdcf65c34dd61c6e3dafc9e51f 100644 (file)
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+require 'inline'
+require 'maasha/seq/ambiguity'
+
 # Class containing methods to assemble two overlapping sequences into a single.
 class Assemble
+  extend Ambiguity
+
   # Class method to assemble two Seq objects.
   def self.pair(entry1, entry2, options = {})
     assemble = self.new(entry1, entry2, options)
@@ -41,17 +46,14 @@ class Assemble
     @options[:overlap_max]      = [@options[:overlap_max], entry1.length, entry2.length].min
   end
 
-  # Method to locate overlapping matche between two sequences.
+  # Method to locate overlapping matches between two sequences.
   def match
     overlap = @options[:overlap_max]
 
-    na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte")
-    na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte")
-
     while overlap >= @options[:overlap_min]
-      hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
+      mismatches_max = (overlap * @options[:mismatches_max] * 0.01).round
 
-      if hamming_dist <= (overlap * @options[:mismatches_max] * 0.01).round
+      if match_C(@entry1.seq, @entry2.seq, @entry1.length - overlap, 0, overlap, mismatches_max)
         entry_left  = @entry1[0 ... @entry1.length - overlap]
         entry_right = @entry2[overlap .. -1]
 
@@ -68,7 +70,7 @@ class Assemble
         entry_overlap.seq.upcase!
         entry_right.seq.downcase!
         entry_merged          = entry_left + entry_overlap + entry_right
-        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
+        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}"
 
         return entry_merged
       end
@@ -98,6 +100,59 @@ class Assemble
 
     merged
   end
+
+  inline do |builder|
+    add_ambiguity_macro(builder)
+
+    # C method for determining if two strings of equal length match
+    # given a maximum allowed mismatches and allowing for IUPAC
+    # ambiguity codes. Returns true if match, else false.
+    builder.c %{
+      VALUE match_C(
+        VALUE _string1,       // String 1
+        VALUE _string2,       // String 2
+        VALUE _offset1,       // Offset 1
+        VALUE _offset2,       // Offset 2
+        VALUE _length,        // String length
+        VALUE _max_mismatch   // Maximum mismatches
+      )
+      {
+        char         *string1      = StringValuePtr(_string1);
+        char         *string2      = StringValuePtr(_string2);
+        unsigned int  offset1      = FIX2UINT(_offset1);
+        unsigned int  offset2      = FIX2UINT(_offset2);
+        unsigned int  length       = FIX2UINT(_length);
+        unsigned int  max_mismatch = FIX2UINT(_max_mismatch);
+
+        unsigned int max_match = length - max_mismatch;
+        unsigned int match     = 0;
+        unsigned int mismatch  = 0;
+        unsigned int i         = 0;
+
+        for (i = 0; i < length; i++)
+        {
+          if (MATCH(string1[i + offset1], string2[i + offset2]))
+          {
+            match++;
+
+            if (match >= max_match) {
+              return Qtrue;
+            }
+          }
+          else
+          {
+            mismatch++;
+
+            if (mismatch > max_mismatch) {
+              return Qfalse;
+            }
+          }
+        }
+
+        return Qfalse;
+      }
+    }
+  end
 end