require 'maasha/fasta'
require 'maasha/align'
-BYTES_IN_INT = 4
-BYTES_IN_FLOAT = 4
-BYTES_IN_HIT = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT # i.e. 12
-NUC_ALPH_SIZE = 4 # Alphabet size of nucleotides.
-RESULT_ARY_MAX = 50_000_000 # Maximum size for the result_ary.
-HIT_ARY_MAX = 100_000 # Maximum size for the hit_ary.
+BYTES_IN_INT = 4 # Bytes reserved per integer slot in the packed byte strings.
+BYTES_IN_FLOAT = 4 # Bytes reserved per float slot in a hit record.
+BYTES_IN_HIT = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT # Bytes per hit record: two ints plus one float, i.e. 12.
+NUC_ALPH_SIZE = 4 # Alphabet size of nucleotides; the oligo array holds NUC_ALPH_SIZE ** kmer entries.
+RESULT_ARY_BUFFER = 10_000_000 # Number of hit records by which result_ary is allocated and grown.
+HIT_ARY_BUFFER = 1_000_000 # Number of hit records allocated for the hit_ary.
# FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
# The purpose is to find similarities between query DNA/RNA sequences and a
time = Time.now
oligo_ary = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
shared_ary = "\0" * @q_size * BYTES_IN_INT
- result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT
+ result_ary = "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
result_count = 0
Fasta.open(file, 'r') do |ios|
if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
$stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
end
+
+ if result_ary.size / BYTES_IN_HIT - result_count < RESULT_ARY_BUFFER / 2
+ result_ary << "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
+ end
end
end
def each
sort_hits_c(@result, @result_count)
- hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
+ hit_ary = "\0" * HIT_ARY_BUFFER * BYTES_IN_HIT
hit_index = 0
(0 ... @q_size).each do |q_index|
- zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
+ zero_ary_c(hit_ary, HIT_ARY_BUFFER * BYTES_IN_HIT)
hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
if @opt_hash[:max_hits]