add comments to ambiguity.rb

[biopieces.git] / code_ruby / lib / maasha / findsim.rb
diff --git a/code_ruby/lib/maasha/findsim.rb b/code_ruby/lib/maasha/findsim.rb

index 7e5d100f4c4f03b78d6c6d987b2b6103785ba7a1..58fab98d70e42069deffda3a095d96ae1c4610e8 100644 (file)
--- a/code_ruby/lib/maasha/findsim.rb
+++ b/code_ruby/lib/maasha/findsim.rb
@@ -26,12 +26,12 @@ require 'inline'
  require 'maasha/fasta'
  require 'maasha/align'
  
-BYTES_IN_INT   = 4
-BYTES_IN_FLOAT = 4
-BYTES_IN_HIT   = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT   # i.e. 12
-NUC_ALPH_SIZE  = 4            # Alphabet size of nucleotides.
-RESULT_ARY_MAX = 50_000_000   # Maximum size for the result_ary.
-HIT_ARY_MAX    = 100_000      # Maximum size for the hit_ary.
+BYTES_IN_INT      = 4
+BYTES_IN_FLOAT    = 4
+BYTES_IN_HIT      = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT   # i.e. 12
+NUC_ALPH_SIZE     = 4            # Alphabet size of nucleotides.
+RESULT_ARY_BUFFER = 10_000_000   # Buffer for the result_ary.
+HIT_ARY_BUFFER    = 1_000_000    # Buffer for the hit_ary.
  
  # FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
  # The purpose is to find similarities between query DNA/RNA sequences and a
@@ -40,11 +40,7 @@ HIT_ARY_MAX    = 100_000      # Maximum size for the hit_ary.
  # divided by the smallest number of unique oligoes in either the query or
  # database sequence. This yields a rough under estimate of similarity e.g. 50%
  # oligo similarity may correspond to 80% similarity on a nucleotide level
-# (needs clarification). The outcome of FindSim is a table with a row per
-# query sequence and the columns are the database hits sorted according to
-# similarity.
-#
-# Extensive use of inline C for speed.
+# (needs clarification). 
  class FindSim
    include Enumerable
  
@@ -100,11 +96,11 @@ class FindSim
    # Method to search database or subject sequences from a FASTA file by
    # locating for each sequence all shared oligos with the query index.
    def search_db(file)
-    time           = Time.now
-    oligo_ary      = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
-    shared_ary     = "\0" * @q_size                             * BYTES_IN_INT
-    result_ary     = "\0" * RESULT_ARY_MAX                      * BYTES_IN_HIT
-    result_count   = 0
+    time         = Time.now
+    oligo_ary    = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
+    shared_ary   = "\0" * @q_size                             * BYTES_IN_INT
+    result_ary   = "\0" * RESULT_ARY_BUFFER                   * BYTES_IN_HIT
+    result_count = 0
  
      Fasta.open(file, 'r') do |ios|
        ios.each_with_index do |entry, s_index|
@@ -123,6 +119,10 @@ class FindSim
          if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
            $stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
          end
+
+        if result_ary.size / BYTES_IN_HIT - result_count < RESULT_ARY_BUFFER / 2
+          result_ary << "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
+        end
        end
      end
  
@@ -135,12 +135,12 @@ class FindSim
    def each
      sort_hits_c(@result, @result_count)
  
-    hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
+    hit_ary = "\0" * HIT_ARY_BUFFER * BYTES_IN_HIT
  
      hit_index = 0
  
      (0 ... @q_size).each do |q_index|
-      zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
+      zero_ary_c(hit_ary, HIT_ARY_BUFFER * BYTES_IN_HIT)
        hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
  
        if @opt_hash[:max_hits]
@@ -149,6 +149,8 @@ class FindSim
          max = hit_ary_size
        end
  
+      best_score = 0
+
        (0 ... max).each do |i|
          q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
  
@@ -160,11 +162,10 @@ class FindSim
            score     = new_score if new_score > score
          end
  
-        best_score = 0
-
          if @opt_hash[:max_diversity]
            best_score = score if i == 0
-          break if best_score - score > @opt_hash[:max_diversity]
+
+          break if best_score - score >= (@opt_hash[:max_diversity] / 100)
          end
  
          yield Hit.new(q_id, s_id, score)
@@ -191,8 +192,7 @@ class FindSim
    #    particular oligo.
    def create_query_index(q_total, oligo_hash)
      @q_total_ary = q_total.pack("I*")
-
-    @q_ary = ""
+    @q_ary       = ""
  
      beg        = 0
      oligo_begs = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
@@ -355,7 +355,7 @@ class FindSim
          VALUE _step        // Step size for overlapping kmers.
        )
        {
-        unsigned char *str      = StringValuePtr(_str);
+        char *str               = StringValuePtr(_str);
          unsigned int   str_size = FIX2UINT(_str_size);
          unsigned int  *ary      = (unsigned int *) StringValuePtr(_ary);
          unsigned int   kmer     = FIX2UINT(_kmer);
@@ -436,7 +436,7 @@ class FindSim
  
      # Method that counts all shared oligos/kmers between a subject sequence and
      # all query sequences. For each oligo in the subject sequence (s_ary) the
-    # index of all query sequences containing this oligo is found the the q_ary
+    # index of all query sequences containing this oligo is found for the q_ary
      # where this information is stored sequentially in intervals. For the
      # particula oligo the interval is looked up in the q_beg and q_end arrays.
      # Shared oligos are recorded in the shared_ary.