added max_diversity switch to findsim.rb

author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>

Mon, 25 Jun 2012 09:01:26 +0000 (09:01 +0000)

committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>

Mon, 25 Jun 2012 09:01:26 +0000 (09:01 +0000)
author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 25 Jun 2012 09:01:26 +0000 (09:01 +0000)
committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 25 Jun 2012 09:01:26 +0000 (09:01 +0000)
diff --git a/code_ruby/lib/maasha/findsim.rb b/code_ruby/lib/maasha/findsim.rb

index 6b1ad23552ed1c434cfa161f042229102c7d8709..d33229d221bc3754f7a9a23673ad9dc384307e27 100644 (file)
--- a/code_ruby/lib/maasha/findsim.rb
+++ b/code_ruby/lib/maasha/findsim.rb
@@ -28,7 +28,8 @@ require 'maasha/align'
  
  BYTES_IN_INT   = 4
  BYTES_IN_FLOAT = 4
-BYTES_IN_HIT   = 12
+BYTES_IN_HIT   = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT   # i.e. 12
+NUC_ALPH_SIZE  = 4            # Alphabet size of nucleotides.
  RESULT_ARY_MAX = 50_000_000   # Maximum size for the result_ary.
  HIT_ARY_MAX    = 100_000      # Maximum size for the hit_ary.
  
@@ -100,9 +101,9 @@ class FindSim
    # locating for each sequence all shared oligos with the query index.
    def search_db(file)
      time           = Time.now
-    oligo_ary      = "\0" * (4 ** @opt_hash[:kmer]) * BYTES_IN_INT
-    shared_ary     = "\0" * @q_size                 * BYTES_IN_INT
-    result_ary     = "\0" * RESULT_ARY_MAX          * BYTES_IN_HIT
+    oligo_ary      = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
+    shared_ary     = "\0" * @q_size                             * BYTES_IN_INT
+    result_ary     = "\0" * RESULT_ARY_MAX                      * BYTES_IN_HIT
      result_count   = 0
  
      Fasta.open(file, 'r') do |ios|
@@ -110,8 +111,8 @@ class FindSim
          @s_ids     << entry.seq_name if @opt_hash[:subject_ids]
          @s_entries << entry          if @opt_hash[:realign]
  
-        zero_ary_c(oligo_ary,  (4 ** @opt_hash[:kmer]) * BYTES_IN_INT)
-        zero_ary_c(shared_ary, @q_size                 * BYTES_IN_INT)
+        zero_ary_c(oligo_ary,  (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT)
+        zero_ary_c(shared_ary, @q_size                             * BYTES_IN_INT)
  
          oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step])
  
@@ -159,6 +160,11 @@ class FindSim
            score     = new_score if new_score > score
          end
  
+        if @opt_hash[:max_diversity]
+          best_score = score if i == 0
+          break if best_score - score > @opt_hash[:max_diversity]
+        end
+
          yield Hit.new(q_id, s_id, score)
        end
  
@@ -187,8 +193,8 @@ class FindSim
      @q_ary = ""
  
      beg        = 0
-    oligo_begs = Array.new(4 ** @opt_hash[:kmer], 0)
-    oligo_ends = Array.new(4 ** @opt_hash[:kmer], 0)
+    oligo_begs = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
+    oligo_ends = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
  
      oligo_hash.each do |oligo, list|
        @q_ary << list.pack("I*")
author	martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
	Mon, 25 Jun 2012 09:01:26 +0000 (09:01 +0000)
committer	martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
	Mon, 25 Jun 2012 09:01:26 +0000 (09:01 +0000)