require 'inline'
require 'maasha/fasta'
+require 'maasha/align'
-BYTES_IN_INT = 4
-BYTES_IN_FLOAT = 4
-BYTES_IN_HIT = 12
-RESULT_ARY_MAX = 50_000_000 # Maximum size for the result_ary.
-HIT_ARY_MAX = 100_000 # Maximum size for the hit_ary.
+BYTES_IN_INT = 4
+BYTES_IN_FLOAT = 4
+BYTES_IN_HIT = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT # i.e. 12
+NUC_ALPH_SIZE = 4 # Alphabet size of nucleotides.
+RESULT_ARY_BUFFER = 10_000_000 # Buffer for the result_ary.
+HIT_ARY_BUFFER = 1_000_000 # Buffer for the hit_ary.
# FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
# The purpose is to find similarities between query DNA/RNA sequences and a
# database of subject sequences. Similarity is calculated as the number of
# unique shared oligos divided by the smallest number of unique oligos in
# either the query or
# database sequence. This yields a rough under estimate of similarity e.g. 50%
# oligo similarity may correspond to 80% similarity on a nucleotide level
-# (needs clarification). The outcome of FindSim is a table with a row per
-# query sequence and the columns are the database hits sorted according to
-# similarity.
-#
-# Extensive use of inline C for speed.
+# (needs clarification).
class FindSim
include Enumerable
@result_count = 0
@q_ids = []
@s_ids = []
+ @q_entries = []
+ @s_entries = []
end
# Method to load sequences from a query file in FASTA format
Fasta.open(file, 'r') do |ios|
ios.each do |entry|
- @q_ids << entry.seq_name if @opt_hash[:query_ids]
+ @q_ids << entry.seq_name if @opt_hash[:query_ids]
+ @q_entries << entry if @opt_hash[:realign]
oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort
# Method to search database or subject sequences from a FASTA file by
# locating for each sequence all shared oligos with the query index.
def search_db(file)
- time = Time.now
- oligo_ary = "\0" * (4 ** @opt_hash[:kmer]) * BYTES_IN_INT
- shared_ary = "\0" * @q_size * BYTES_IN_INT
- result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT
- result_count = 0
+ time = Time.now
+ oligo_ary = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
+ shared_ary = "\0" * @q_size * BYTES_IN_INT
+ result_ary = "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
+ result_count = 0
Fasta.open(file, 'r') do |ios|
ios.each_with_index do |entry, s_index|
- @s_ids << entry.seq_name if @opt_hash[:subject_ids]
+ @s_ids << entry.seq_name if @opt_hash[:subject_ids]
+ @s_entries << entry if @opt_hash[:realign]
- zero_ary_c(oligo_ary, (4 ** @opt_hash[:kmer]) * BYTES_IN_INT)
- zero_ary_c(shared_ary, @q_size * BYTES_IN_INT)
+ zero_ary_c(oligo_ary, (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT)
+ zero_ary_c(shared_ary, @q_size * BYTES_IN_INT)
oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step])
if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
$stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
end
+
+ if result_ary.size / BYTES_IN_HIT - result_count < RESULT_ARY_BUFFER / 2
+ result_ary << "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
+ end
end
end
def each
sort_hits_c(@result, @result_count)
- hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
+ hit_ary = "\0" * HIT_ARY_BUFFER * BYTES_IN_HIT
hit_index = 0
(0 ... @q_size).each do |q_index|
- zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
+ zero_ary_c(hit_ary, HIT_ARY_BUFFER * BYTES_IN_HIT)
hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
- if @opt_hash[:report_scores]
- max = (hit_ary_size > @opt_hash[:report_scores]) ? @opt_hash[:report_scores] : hit_ary_size
+ if @opt_hash[:max_hits]
+ max = (hit_ary_size > @opt_hash[:max_hits]) ? @opt_hash[:max_hits] : hit_ary_size
else
max = hit_ary_size
end
+ best_score = 0
+
(0 ... max).each do |i|
q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
q_id = @opt_hash[:query_ids] ? @q_ids[q_index] : q_index
s_id = @opt_hash[:subject_ids] ? @s_ids[s_index] : s_index
+ if @opt_hash[:realign]
+ new_score = Align.muscle([@q_entries[q_index], @s_entries[s_index]]).identity
+ score = new_score if new_score > score
+ end
+
+ if @opt_hash[:max_diversity]
+ best_score = score if i == 0
+
+ break if best_score - score >= (@opt_hash[:max_diversity] / 100)
+ end
+
yield Hit.new(q_id, s_id, score)
end
# particular oligo.
def create_query_index(q_total, oligo_hash)
@q_total_ary = q_total.pack("I*")
-
- @q_ary = ""
+ @q_ary = ""
beg = 0
- oligo_begs = Array.new(4 ** @opt_hash[:kmer], 0)
- oligo_ends = Array.new(4 ** @opt_hash[:kmer], 0)
+ oligo_begs = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
+ oligo_ends = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
oligo_hash.each do |oligo, list|
@q_ary << list.pack("I*")
VALUE _step // Step size for overlapping kmers.
)
{
- unsigned char *str = StringValuePtr(_str);
+ char *str = StringValuePtr(_str);
unsigned int str_size = FIX2UINT(_str_size);
unsigned int *ary = (unsigned int *) StringValuePtr(_ary);
unsigned int kmer = FIX2UINT(_kmer);
# Method that counts all shared oligos/kmers between a subject sequence and
# all query sequences. For each oligo in the subject sequence (s_ary) the
- # index of all query sequences containing this oligo is found the the q_ary
+ # index of all query sequences containing this oligo is found for the q_ary
# where this information is stored sequentially in intervals. For the
# particular oligo the interval is looked up in the q_beg and q_end arrays.
# Shared oligos are recorded in the shared_ary.
def to_s
"#{@q_id}:#{@s_id}:#{@score.round(2)}"
end

# Convert this hit to a Biopiece record (a plain Hash).
#
# Returns a Hash with the keys :REC_TYPE (always "findsim"), :Q_ID, :S_ID,
# and :SCORE — the score rounded to two decimals.
def to_bp
  hash = {}
  hash[:REC_TYPE] = "findsim"
  hash[:Q_ID]     = @q_id
  hash[:S_ID]     = @s_id
  hash[:SCORE]    = @score.round(2)

  hash
end
end
end