]> git.donarmstrong.com Git - biopieces.git/blobdiff - code_ruby/lib/maasha/seq/assemble.rb
fixed bug in assemble_pairs
[biopieces.git] / code_ruby / lib / maasha / seq / assemble.rb
index e59e178b73df8ba45689836db02c852d5632f637..bab869c98b580a52cb1612b9d21b4add8760de10 100644 (file)
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+require 'inline'
+require 'maasha/seq/ambiguity'
+
+# Class containing methods to assemble two overlapping sequences into a single.
 class Assemble
+  extend Ambiguity
+
+  # Class method to assemble two Seq objects.
   def self.pair(entry1, entry2, options = {})
     assemble = self.new(entry1, entry2, options)
     assemble.match
   end
 
+  # Method to initialize an Assembly object.
   def initialize(entry1, entry2, options)
     @entry1  = entry1
     @entry2  = entry2
+    @overlap = 0
+    @offset1 = 0
+    @offset2 = 0
     @options = options
     @options[:mismatches_max] ||= 0
     @options[:overlap_min]    ||= 1
-    @options[:overlap_max]    ||= entry1.length
-    @options[:overlap_max]      = [@options[:overlap_max], entry1.length, entry2.length].min
   end
 
+  # Method to locate overlapping matches between two sequences.
   def match
-    overlap = @options[:overlap_max]
+    if @options[:overlap_max]
+      @overlap = [@options[:overlap_max], @entry1.length, @entry2.length].min
+    else
+      @overlap = [@entry1.length, @entry2.length].min
+    end
 
-    na_seq1 = NArray.to_na(@entry1.seq.downcase, "byte")
-    na_seq2 = NArray.to_na(@entry2.seq.downcase, "byte")
+    diff = @entry1.length - @entry2.length
+    diff = 0 if diff < 0
 
-    while overlap >= @options[:overlap_min]
-      hamming_dist = (na_seq1[-1 * overlap .. -1] ^ na_seq2[0 ... overlap]).count_true
+    @offset1 = @entry1.length - @overlap - diff
 
-      if hamming_dist <= percent2real(overlap, @options[:mismatches_max])
-        merged = @entry1 + @entry2[overlap .. -1]
+    while @overlap >= @options[:overlap_min]
+      mismatches_max = (@overlap * @options[:mismatches_max] * 0.01).round
+     
+      # puts "diff: #{diff}   offset1: #{@offset1}  offset2: #{@offset2}   overlap: #{@overlap}"
 
-        if @entry1.qual and @entry2.qual
-          qual1 = @entry1.qual[@entry1.length - overlap .. -1]
-          qual2 = @entry2.qual[0 ... overlap]
+      if mismatches = match_C(@entry1.seq, @entry2.seq, @offset1, @offset2, @overlap, mismatches_max) and mismatches >= 0
+        entry_merged          = entry_left + entry_overlap + entry_right
+        entry_merged.seq_name = @entry1.seq_name + ":overlap=#{@overlap}:hamming=#{mismatches}"
 
-          na_qual1 = NArray.to_na(qual1, "byte")
-          na_qual2 = NArray.to_na(qual2, "byte")
+        return entry_merged
+      end
 
-          qual = ((na_qual1 + na_qual2) / 2).to_s
+      if diff > 0
+        diff -= 1
+      else
+        @overlap -= 1
+      end
 
-          merged.seq_name = @entry1.seq_name + ":overlap=#{overlap}:hamming=#{hamming_dist}"
-          merged.qual[@entry1.length - overlap ... @entry1.length]  = qual
-        end
+      @offset1 += 1
+    end
+  end
 
-        return merged
-      end
+  # Method to extract and downcase the left part of an assembled pair.
+  def entry_left
+    entry = @entry1[0 ... @offset1]
+    entry.seq.downcase!
+    entry
+  end
 
-      overlap -= 1
+  # Method to extract and downcase the right part of an assembled pair.
+  def entry_right
+    if @entry1.length > @offset1 + @overlap
+      entry = @entry1[@offset1 + @overlap .. -1]
+    else
+      entry = @entry2[@offset2 + @overlap .. -1]
     end
+
+    entry.seq.downcase!
+    entry
+  end
+
+  # Method to extract and upcase the overlapping part of an assembled pair.
+  def entry_overlap
+    if @entry1.qual and @entry2.qual
+      entry_overlap1 = @entry1[@offset1 ... @offset1 + @overlap]
+      entry_overlap2 = @entry2[@offset2 ... @offset2 + @overlap]
+
+      entry = merge_overlap(entry_overlap1, entry_overlap2)
+    else
+      entry = @entry1[@offset1 ... @offset1 + @overlap]
+    end
+
+    entry.seq.upcase!
+    entry
+  end
+
+  # Method to merge sequence and quality scores in an overlap.
+  # The residue with the highest score at mismatch positions is selected.
+  # The quality scores of the overlap are the mean of the two sequences.
+  def merge_overlap(entry_overlap1, entry_overlap2)
+    na_seq = NArray.byte(entry_overlap1.length, 2)
+    na_seq[true, 0] = NArray.to_na(entry_overlap1.seq.downcase, "byte")
+    na_seq[true, 1] = NArray.to_na(entry_overlap2.seq.downcase, "byte")
+
+    na_qual = NArray.byte(entry_overlap1.length, 2)
+    na_qual[true, 0] = NArray.to_na(entry_overlap1.qual, "byte")
+    na_qual[true, 1] = NArray.to_na(entry_overlap2.qual, "byte")
+
+    mask_xor = na_seq[true, 0] ^ na_seq[true, 1] > 0
+    mask_seq = ((na_qual * mask_xor).eq( (na_qual * mask_xor).max(1)))
+
+    merged      = Seq.new()
+    merged.seq  = (na_seq * mask_seq).max(1).to_s
+    merged.qual = na_qual.mean(1).round.to_type("byte").to_s
+
+    merged
   end
 
-  def percent2real(length, percent)
-      (length * percent * 0.01).round
+  inline do |builder|
+    add_ambiguity_macro(builder)
+
+    # C method for determining if two strings of equal length match
+    # given a maximum allowed mismatches and allowing for IUPAC
+    # ambiguity codes. Returns number of mismatches is true if match, else false.
+    builder.c %{
+      VALUE match_C(
+        VALUE _string1,       // String 1
+        VALUE _string2,       // String 2
+        VALUE _offset1,       // Offset 1
+        VALUE _offset2,       // Offset 2
+        VALUE _length,        // String length
+        VALUE _max_mismatch   // Maximum mismatches
+      )
+      {
+        char         *string1      = StringValuePtr(_string1);
+        char         *string2      = StringValuePtr(_string2);
+        unsigned int  offset1      = FIX2UINT(_offset1);
+        unsigned int  offset2      = FIX2UINT(_offset2);
+        unsigned int  length       = FIX2UINT(_length);
+        unsigned int  max_mismatch = FIX2UINT(_max_mismatch);
+
+        unsigned int max_match = length - max_mismatch;
+        unsigned int match     = 0;
+        unsigned int mismatch  = 0;
+        unsigned int i         = 0;
+
+        for (i = 0; i < length; i++)
+        {
+          if (MATCH(string1[i + offset1], string2[i + offset2]))
+          {
+            match++;
+
+            if (match >= max_match) {
+              return UINT2NUM(mismatch);
+            }
+          }
+          else
+          {
+            mismatch++;
+
+            if (mismatch > max_mismatch) {
+              return INT2NUM(-1);
+            }
+          }
+        }
+
+        return INT2NUM(-1);
+      }
+    }
   end
 end