1 # Copyright (C) 2007-2012 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 require 'maasha/fasta'
27 require 'maasha/align'
31 BYTES_IN_HIT = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT # i.e. 12
32 NUC_ALPH_SIZE = 4 # Alphabet size of nucleotides.
33 RESULT_ARY_BUFFER = 10_000_000 # Buffer for the result_ary.
34 HIT_ARY_BUFFER = 1_000_000 # Buffer for the hit_ary.
36 # FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
37 # The purpose is to find similarities between query DNA/RNA sequences and a
38 # database of small subunit ribosomal DNA/RNA sequences. The sequence
39 # similarities are calculated as the number of shared unique oligos/kmers
40 # divided by the smallest number of unique oligoes in either the query or
41 # database sequence. This yields a rough under estimate of similarity e.g. 50%
42 # oligo similarity may correspond to 80% similarity on a nucleotide level
43 # (needs clarification).
47 # Method to initialize FindSim Object.
48 def initialize(opt_hash)
63 # Method to load sequences from a query file in FASTA format
64 # and index these for oligo based similarty search.
68 oligo_hash = Hash.new { |h, k| h[k] = [] }
72 Fasta.open(file, 'r') do |ios|
74 @q_ids << entry.seq_name if @opt_hash[:query_ids]
75 @q_entries << entry if @opt_hash[:realign]
77 oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort
79 q_total << oligos.size
81 oligos.each do |oligo|
82 oligo_hash[oligo] << count
91 create_query_index(q_total, oligo_hash)
93 $stderr.puts "Loaded #{count} query sequences in #{(Time.now - time)} seconds." if @opt_hash[:verbose]
96 # Method to search database or subject sequences from a FASTA file by
97 # locating for each sequence all shared oligos with the query index.
100 oligo_ary = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
101 shared_ary = "\0" * @q_size * BYTES_IN_INT
102 result_ary = "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
105 Fasta.open(file, 'r') do |ios|
106 ios.each_with_index do |entry, s_index|
107 @s_ids << entry.seq_name if @opt_hash[:subject_ids]
108 @s_entries << entry if @opt_hash[:realign]
110 zero_ary_c(oligo_ary, (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT)
111 zero_ary_c(shared_ary, @q_size * BYTES_IN_INT)
113 oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step])
115 count_shared_c(@q_ary, @q_begs_ary, @q_ends_ary, oligo_ary, oligo_ary_size, shared_ary)
117 result_count = calc_scores_c(@q_total_ary, shared_ary, @q_size, oligo_ary_size, result_ary, result_count, s_index, @opt_hash[:min_score])
119 if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
120 $stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
123 if result_ary.size / BYTES_IN_HIT - result_count < RESULT_ARY_BUFFER / 2
124 result_ary << "\0" * RESULT_ARY_BUFFER * BYTES_IN_HIT
129 @result = result_ary[0 ... result_count * BYTES_IN_HIT]
130 @result_count = result_count
133 # Method that for each query index yields all hits, sorted according to
134 # decending score, as Hit objects.
136 sort_hits_c(@result, @result_count)
138 hit_ary = "\0" * HIT_ARY_BUFFER * BYTES_IN_HIT
142 (0 ... @q_size).each do |q_index|
143 zero_ary_c(hit_ary, HIT_ARY_BUFFER * BYTES_IN_HIT)
144 hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
146 if @opt_hash[:max_hits]
147 max = (hit_ary_size > @opt_hash[:max_hits]) ? @opt_hash[:max_hits] : hit_ary_size
154 (0 ... max).each do |i|
155 q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
157 q_id = @opt_hash[:query_ids] ? @q_ids[q_index] : q_index
158 s_id = @opt_hash[:subject_ids] ? @s_ids[s_index] : s_index
160 if @opt_hash[:realign]
161 new_score = Align.muscle([@q_entries[q_index], @s_entries[s_index]]).identity
162 score = new_score if new_score > score
165 if @opt_hash[:max_diversity]
166 best_score = score if i == 0
168 break if best_score - score >= (@opt_hash[:max_diversity] / 100)
171 yield Hit.new(q_id, s_id, score)
174 hit_index += hit_ary_size
182 # Method to create the query index for FindSim search. The index consists of
183 # four lookup arrays which are stored as instance variables:
184 # * @q_total_ary holds per index the total of unique oligos for that
185 # particular query sequences.
186 # * @q_ary holds sequencially lists of query indexes so it is possible
187 # to lookup the list for a particular oligo and get all query indeces for
188 # query sequences containing this oligo.
189 # * @q_begs_ary holds per index the begin coordinate for the @q_ary list for
190 # a particular oligo.
191 # * @q_ends_ary holds per index the end coordinate for the @q_ary list for a
193 def create_query_index(q_total, oligo_hash)
194 @q_total_ary = q_total.pack("I*")
198 oligo_begs = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
199 oligo_ends = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
201 oligo_hash.each do |oligo, list|
202 @q_ary << list.pack("I*")
203 oligo_begs[oligo] = beg
204 oligo_ends[oligo] = beg + list.size
209 @q_begs_ary = oligo_begs.pack("I*")
210 @q_ends_ary = oligo_ends.pack("I*")
213 # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
216 # Bitmap lookup table where the index is ASCII values
217 # and the following nucleotides are encoded as:
223 unsigned char oligo_map[256] = {
224 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
229 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
230 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
231 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
233 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
234 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
236 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
237 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
239 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
243 # Defining a hit struct to hold hits consisting of query
244 # sequence index, subject sequence index and score.
248 unsigned int q_index;
249 unsigned int s_index;
254 # Qsort unsigned int comparison function.
255 # Returns negative if b > a and positive if a > b.
257 int uint_cmp(const void *a, const void *b)
259 const unsigned int *ia = (const unsigned int *) a;
260 const unsigned int *ib = (const unsigned int *) b;
266 # Qsort hit struct comparision function of q_index.
267 # Returns negative if b > a and positive if a > b.
269 int hit_cmp_by_q_index(const void *a, const void *b)
274 return (int) (ia->q_index - ib->q_index);
278 # Qsort hit struct comparision function of score.
279 # Returns negative if b < a and positive if a < b.
280 # We multiply with 1000 to maintain 2 decimal floats.
282 int hit_cmp_by_score(const void *a, const void *b)
287 return (int) (1000 * ib->score - 1000 * ia->score);
291 # Given a sorted unsigned integer removes all duplicate values
292 # and move the unique values to the front of the array. Returns
293 # the new size of the array.
295 unsigned int uniq_ary(unsigned int *ary, unsigned int ary_size)
297 unsigned int new_ary_size = 0;
301 for (i = 1; i < ary_size; i++)
303 if (ary[i] != ary[j])
306 ary[j] = ary[i]; // Move it to the front
310 new_ary_size = j + 1;
316 # Method that given a byte array and its size in bytes
317 # sets all bytes to 0.
320 VALUE _ary, // Byte array to zero.
321 VALUE _ary_size // Size of array.
324 char *ary = (char *) StringValuePtr(_ary);
325 unsigned int ary_size = FIX2UINT(_ary_size);
327 bzero(ary, ary_size);
331 # Method to sort array of hits according to q_index.
334 VALUE _ary, // Array of hits.
335 VALUE _ary_size // Size of array.
338 hit *ary = (hit *) StringValuePtr(_ary);
339 unsigned int ary_size = FIX2UINT(_ary_size);
341 qsort(ary, ary_size, sizeof(hit), hit_cmp_by_q_index);
345 # Method that given a string (char array) encodes all kmers overlapping
346 # with a given step size as integers that are saved in an unsigned integer
347 # array. Then the array is sorted and uniq'ed and the number of unique
348 # elements is returned.
350 VALUE str_to_oligo_ary_c(
351 VALUE _str, // DNA or RNA string.
352 VALUE _str_size, // String length.
353 VALUE _ary, // Array to store result in.
354 VALUE _kmer, // Size of kmers/oligos.
355 VALUE _step // Step size for overlapping kmers.
358 char *str = StringValuePtr(_str);
359 unsigned int str_size = FIX2UINT(_str_size);
360 unsigned int *ary = (unsigned int *) StringValuePtr(_ary);
361 unsigned int kmer = FIX2UINT(_kmer);
362 unsigned int step = FIX2UINT(_step);
364 unsigned int ary_size = 0;
365 unsigned int mask = (1 << (2 * kmer)) - 1;
366 unsigned int bin = 0;
369 for (i = 0; i < kmer; i++)
372 bin |= oligo_map[str[i]];
375 ary[ary_size++] = bin;
377 for (i = kmer; i < str_size; i++)
380 bin |= oligo_map[str[i]];
382 if ((i % step) == 0) {
383 ary[ary_size++] = (bin & mask);
387 qsort(ary, ary_size, sizeof(unsigned int), uint_cmp);
388 ary_size = uniq_ary(ary, ary_size);
390 return UINT2NUM(ary_size);
394 # Method that given a string (char array) encodes all kmers overlapping
395 # with a given step size as integers that are pushed onto a Ruby array
397 # TODO should have an option for skipping oligos with ambiguity codes.
399 VALUE str_to_oligo_rb_ary_c(
400 VALUE _str, // DNA or RNA string.
401 VALUE _kmer, // Size of kmer or oligo.
402 VALUE _step // Step size for overlapping kmers.
405 char *str = StringValuePtr(_str);
406 unsigned int kmer = FIX2UINT(_kmer);
407 unsigned int step = FIX2UINT(_step);
409 VALUE array = rb_ary_new();
410 long len = strlen(str);
411 unsigned int mask = (1 << (2 * kmer)) - 1;
412 unsigned int bin = 0;
415 for (i = 0; i < kmer; i++)
418 bin |= oligo_map[str[i]];
421 rb_ary_push(array, INT2FIX(bin));
423 for (i = kmer; i < len; i++)
426 bin |= oligo_map[str[i]];
428 if ((i % step) == 0) {
429 rb_ary_push(array, INT2FIX((bin & mask)));
437 # Method that counts all shared oligos/kmers between a subject sequence and
438 # all query sequences. For each oligo in the subject sequence (s_ary) the
439 # index of all query sequences containing this oligo is found for the q_ary
440 # where this information is stored sequentially in intervals. For the
441 # particula oligo the interval is looked up in the q_beg and q_end arrays.
442 # Shared oligos are recorded in the shared_ary.
445 VALUE _q_ary, // Lookup array with q_index lists.
446 VALUE _q_begs_ary, // Lookup array with interval begins of q_index lists.
447 VALUE _q_ends_ary, // Lookup array with interval ends of q_index lists.
448 VALUE _s_ary, // Subject sequence oligo array.
449 VALUE _s_ary_size, // Subject sequence oligo array size.
450 VALUE _shared_ary // Array with shared oligo counts.
453 unsigned int *q_ary = (unsigned int *) StringValuePtr(_q_ary);
454 unsigned int *q_begs_ary = (unsigned int *) StringValuePtr(_q_begs_ary);
455 unsigned int *q_ends_ary = (unsigned int *) StringValuePtr(_q_ends_ary);
456 unsigned int *s_ary = (unsigned int *) StringValuePtr(_s_ary);
457 unsigned int s_ary_size = FIX2UINT(_s_ary_size);
458 unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
460 unsigned int oligo = 0;
463 unsigned int q_beg = 0;
464 unsigned int q_end = 0;
466 for (s = 0; s < s_ary_size; s++) // each oligo do
470 q_beg = q_begs_ary[oligo];
471 q_end = q_ends_ary[oligo];
473 for (q = q_beg; q < q_end; q++)
475 shared_ary[q_ary[q]]++;
481 # Method to calculate the score for all query vs subject sequence comparisons.
482 # The score is calculated as the number of unique shared oligos divided by the
483 # smallest number of unique oligos in either the subject or query sequence.
484 # Only scores greater than or equal to a given minimum is stored in the hit
485 # array and the size of the hit array is returned.
488 VALUE _q_total_ary, // Lookup array with total oligo counts.
489 VALUE _shared_ary, // Lookup arary with shared oligo counts.
490 VALUE _q_size, // Number of query sequences.
491 VALUE _s_count, // Number of unique oligos in subject sequence.
492 VALUE _hit_ary, // Result array.
493 VALUE _hit_ary_size, // Result array size.
494 VALUE _s_index, // Subject sequence index.
495 VALUE _min_score // Minimum score to include.
498 unsigned int *q_total_ary = (unsigned int *) StringValuePtr(_q_total_ary);
499 unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
500 unsigned int q_size = FIX2UINT(_q_size);
501 unsigned int s_count = FIX2UINT(_s_count);
502 hit *hit_ary = (hit *) StringValuePtr(_hit_ary);
503 unsigned int hit_ary_size = FIX2UINT(_hit_ary_size);
504 unsigned int s_index = FIX2UINT(_s_index);
505 float min_score = (float) NUM2DBL(_min_score);
507 unsigned int q_index = 0;
508 unsigned int q_count = 0;
509 unsigned int shared_count = 0;
510 unsigned int total_count = 0;
512 hit new_score = {0, 0, 0};
514 for (q_index = 0; q_index < q_size; q_index++)
516 shared_count = shared_ary[q_index];
517 q_count = q_total_ary[q_index];
518 total_count = (s_count < q_count) ? s_count : q_count;
520 score = shared_count / (float) total_count;
522 if (score >= min_score)
524 new_score.q_index = q_index;
525 new_score.s_index = s_index;
526 new_score.score = score;
528 hit_ary[hit_ary_size] = new_score;
534 return UINT2NUM(hit_ary_size);
538 # Method that fetches all hits matching a given q_index value
539 # from the result_ary. Since the result_ary is sorted according
540 # to q_index we start at a given hit_index and fetch until
541 # the next hit with a different q_index. The fetched hits are
542 # stored in an array which is sorted according to decreasing
543 # score. The size of this array is returned.
546 VALUE _result_ary, // Result array
547 VALUE _result_ary_size, // Result array size
548 VALUE _hit_index, // Offset position in hit_ary to search from.
549 VALUE _hit_ary, // Array to store hits in.
550 VALUE _q_index) // q_index to fetch.
552 hit *result_ary = (hit *) StringValuePtr(_result_ary);
553 unsigned int result_ary_size = FIX2UINT(_result_ary_size);
554 unsigned int hit_index = FIX2UINT(_hit_index);
555 hit *hit_ary = (hit *) StringValuePtr(_hit_ary);
556 unsigned int q_index = FIX2UINT(_q_index);
557 hit new_hit = {0, 0, 0};
558 unsigned int hit_ary_size = 0;
560 while ((hit_index + hit_ary_size) < result_ary_size)
562 new_hit = result_ary[hit_index + hit_ary_size];
564 if (new_hit.q_index == q_index)
566 hit_ary[hit_ary_size] = new_hit;
576 qsort(hit_ary, hit_ary_size, sizeof(hit), hit_cmp_by_score);
578 return UINT2NUM(hit_ary_size);
583 # >>>>>>>>>>>>>>> Embedded classes <<<<<<<<<<<<<<<
585 # Class for holding score information.
587 attr_reader :q_id, :s_id, :score
589 # Method to initialize Hit object with
590 # query and subject id a score.
591 def initialize(q_id, s_id, score)
597 # Method for outputting score objects.
599 "#{@q_id}:#{@s_id}:#{@score.round(2)}"
602 # Method to convert to a Biopiece record.
605 hash[:REC_TYPE] = "findsim"
608 hash[:SCORE] = @score.round(2)