require 'maasha/fasta'
require 'maasha/align'
-BYTES_IN_INT = 4
-BYTES_IN_FLOAT = 4
-BYTES_IN_HIT = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT # i.e. 12
-NUC_ALPH_SIZE = 4 # Alphabet size of nucleotides.
-RESULT_ARY_MAX = 50_000_000 # Maximum size for the result_ary.
-HIT_ARY_MAX = 100_000 # Maximum size for the hit_ary.
+BYTES_IN_INT = 4
+BYTES_IN_FLOAT = 4
+BYTES_IN_HIT = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT # i.e. 12
+NUC_ALPH_SIZE = 4 # Alphabet size of nucleotides.
+RESULT_ARY_BUFFER = 10_000_000 # Buffer for the result_ary.
+HIT_ARY_BUFFER = 1_000_000 # Buffer for the hit_ary.
# FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
# The purpose is to find similarities between query DNA/RNA sequences and a
# divided by the smallest number of unique oligoes in either the query or
# database sequence. This yields a rough under estimate of similarity e.g. 50%
# oligo similarity may correspond to 80% similarity on a nucleotide level
-# (needs clarification). The outcome of FindSim is a table with a row per
-# query sequence and the columns are the database hits sorted according to
-# similarity.
-#
-# Extensive use of inline C for speed.
+# (needs clarification).
class FindSim
include Enumerable
# Method to search database or subject sequences from a FASTA file by
# locating for each sequence all shared oligos with the query index.
def search_db(file)
- time = Time.now
- oligo_ary = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
- shared_ary = "\0" * @q_size * BYTES_IN_INT
- result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT
- result_count = 0
+ time = Time.now
+ oligo_ary = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
+ shared_ary = "\0" * @q_size * BYTES_IN_INT
+ result_ary = "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
+ result_count = 0
Fasta.open(file, 'r') do |ios|
ios.each_with_index do |entry, s_index|
if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
$stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
end
+
+ if result_ary.size / BYTES_IN_HIT - result_count < RESULT_ARY_BUFFER / 2
+ result_ary << "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
+ end
end
end
def each
sort_hits_c(@result, @result_count)
- hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
+ hit_ary = "\0" * HIT_ARY_BUFFER * BYTES_IN_HIT
hit_index = 0
(0 ... @q_size).each do |q_index|
- zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
+ zero_ary_c(hit_ary, HIT_ARY_BUFFER * BYTES_IN_HIT)
hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
if @opt_hash[:max_hits]
max = hit_ary_size
end
+ best_score = 0
+
(0 ... max).each do |i|
q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
score = new_score if new_score > score
end
- best_score = 0
-
if @opt_hash[:max_diversity]
best_score = score if i == 0
- break if best_score - score > @opt_hash[:max_diversity]
+
+ break if best_score - score >= (@opt_hash[:max_diversity] / 100)
end
yield Hit.new(q_id, s_id, score)
# particular oligo.
def create_query_index(q_total, oligo_hash)
@q_total_ary = q_total.pack("I*")
-
- @q_ary = ""
+ @q_ary = ""
beg = 0
oligo_begs = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
VALUE _step // Step size for overlapping kmers.
)
{
- unsigned char *str = StringValuePtr(_str);
+ char *str = StringValuePtr(_str);
unsigned int str_size = FIX2UINT(_str_size);
unsigned int *ary = (unsigned int *) StringValuePtr(_ary);
unsigned int kmer = FIX2UINT(_kmer);
# Method that counts all shared oligos/kmers between a subject sequence and
# all query sequences. For each oligo in the subject sequence (s_ary) the
- # index of all query sequences containing this oligo is found the the q_ary
+ # index of all query sequences containing this oligo is found for the q_ary
# where this information is stored sequentially in intervals. For the
# particula oligo the interval is looked up in the q_beg and q_end arrays.
# Shared oligos are recorded in the shared_ary.