# divided by the smallest number of unique oligos in either the query or
# database sequence. This yields a rough underestimate of similarity, e.g. 50%
# oligo similarity may correspond to 80% similarity on a nucleotide level
-# (needs clarification). The outcome of FindSim is a table with a row per
-# query sequence and the columns are the database hits sorted according to
-# similarity.
-#
-# Extensive use of inline C for speed.
+# (needs clarification).
class FindSim
include Enumerable
# Method to search database or subject sequences from a FASTA file by
# locating for each sequence all shared oligos with the query index.
def search_db(file)
- time = Time.now
- oligo_ary = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
- shared_ary = "\0" * @q_size * BYTES_IN_INT
- result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT
- result_count = 0
+ time = Time.now
+ oligo_ary = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
+ shared_ary = "\0" * @q_size * BYTES_IN_INT
+ result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT
+ result_count = 0
Fasta.open(file, 'r') do |ios|
ios.each_with_index do |entry, s_index|
# particular oligo.
def create_query_index(q_total, oligo_hash)
@q_total_ary = q_total.pack("I*")
-
- @q_ary = ""
+ @q_ary = ""
beg = 0
oligo_begs = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
# Method that counts all shared oligos/kmers between a subject sequence and
# all query sequences. For each oligo in the subject sequence (s_ary) the
- # index of all query sequences containing this oligo is found the the q_ary
+ # index of all query sequences containing this oligo is found in the q_ary
# where this information is stored sequentially in intervals. For the
# particular oligo the interval is looked up in the q_beg and q_end arrays.
# Shared oligos are recorded in the shared_ary.