1 # Copyright (C) 2007-2012 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 require 'maasha/fasta'
31 RESULT_ARY_MAX = 50_000_000 # Maximum size for the result_ary.
32 HIT_ARY_MAX = 100_000 # Maximum size for the hit_ary.
34 # FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
35 # The purpose is to find similarities between query DNA/RNA sequences and a
36 # database of small subunit ribosomal DNA/RNA sequences. The sequence
37 # similarities are calculated as the number of shared unique oligos/kmers
38 # divided by the smallest number of unique oligoes in either the query or
39 # database sequence. This yields a rough under estimate of similarity e.g. 50%
40 # oligo similarity may correspond to 80% similarity on a nucleotide level
41 # (needs clarification). The outcome of FindSim is a table with a row per
42 # query sequence and the columns are the database hits sorted according to
45 # Extensive use of inline C for speed.
49 # Method to initialize FindSim Object.
50 def initialize(opt_hash)
63 # Method to load sequences from a query file in FASTA format
64 # and index these for oligo based similarty search.
68 oligo_hash = Hash.new { |h, k| h[k] = [] }
72 Fasta.open(file, 'r') do |ios|
74 @q_ids << entry.seq_name if @opt_hash[:query_ids]
76 oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort
78 q_total << oligos.size
80 oligos.each do |oligo|
81 oligo_hash[oligo] << count
90 create_query_index(q_total, oligo_hash)
92 $stderr.puts "Loaded #{count} query sequences in #{(Time.now - time)} seconds." if @opt_hash[:verbose]
95 # Method to search database or subject sequences from a FASTA file by
96 # locating for each sequence all shared oligos with the query index.
99 oligo_ary = "\0" * (4 ** @opt_hash[:kmer]) * BYTES_IN_INT
100 shared_ary = "\0" * @q_size * BYTES_IN_INT
101 result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT
104 Fasta.open(file, 'r') do |ios|
105 ios.each_with_index do |entry, s_index|
106 @s_ids << entry.seq_name if @opt_hash[:subject_ids]
108 zero_ary_c(oligo_ary, (4 ** @opt_hash[:kmer]) * BYTES_IN_INT)
109 zero_ary_c(shared_ary, @q_size * BYTES_IN_INT)
111 oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step])
113 count_shared_c(@q_ary, @q_begs_ary, @q_ends_ary, oligo_ary, oligo_ary_size, shared_ary)
115 result_count = calc_scores_c(@q_total_ary, shared_ary, @q_size, oligo_ary_size, result_ary, result_count, s_index, @opt_hash[:min_score])
117 if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
118 $stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
123 @result = result_ary[0 ... result_count * BYTES_IN_HIT]
124 @result_count = result_count
127 # Method that for each query index yields all hits, sorted according to
128 # decending score, as Hit objects.
130 sort_hits_c(@result, @result_count)
132 hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
136 (0 ... @q_size).each do |q_index|
137 zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
138 hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
140 if @opt_hash[:report_scores]
141 max = (hit_ary_size > @opt_hash[:report_scores]) ? @opt_hash[:report_scores] : hit_ary_size
146 (0 ... max).each do |i|
147 q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
149 q_id = @opt_hash[:query_ids] ? @q_ids[q_index] : q_index
150 s_id = @opt_hash[:subject_ids] ? @s_ids[s_index] : s_index
152 yield Hit.new(q_id, s_id, score)
155 hit_index += hit_ary_size
163 # Method to create the query index for FindSim search. The index consists of
164 # four lookup arrays which are stored as instance variables:
165 # * @q_total_ary holds per index the total of unique oligos for that
166 # particular query sequences.
167 # * @q_ary holds sequencially lists of query indexes so it is possible
168 # to lookup the list for a particular oligo and get all query indeces for
169 # query sequences containing this oligo.
170 # * @q_begs_ary holds per index the begin coordinate for the @q_ary list for
171 # a particular oligo.
172 # * @q_ends_ary holds per index the end coordinate for the @q_ary list for a
174 def create_query_index(q_total, oligo_hash)
175 @q_total_ary = q_total.pack("I*")
180 oligo_begs = Array.new(4 ** @opt_hash[:kmer], 0)
181 oligo_ends = Array.new(4 ** @opt_hash[:kmer], 0)
183 oligo_hash.each do |oligo, list|
184 @q_ary << list.pack("I*")
185 oligo_begs[oligo] = beg
186 oligo_ends[oligo] = beg + list.size
191 @q_begs_ary = oligo_begs.pack("I*")
192 @q_ends_ary = oligo_ends.pack("I*")
195 # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
198 # Bitmap lookup table where the index is ASCII values
199 # and the following nucleotides are encoded as:
205 unsigned char oligo_map[256] = {
206 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
213 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
225 # Defining a hit struct to hold hits consisting of query
226 # sequence index, subject sequence index and score.
230 unsigned int q_index;
231 unsigned int s_index;
236 # Qsort unsigned int comparison function.
237 # Returns negative if b > a and positive if a > b.
239 int uint_cmp(const void *a, const void *b)
241 const unsigned int *ia = (const unsigned int *) a;
242 const unsigned int *ib = (const unsigned int *) b;
248 # Qsort hit struct comparision function of q_index.
249 # Returns negative if b > a and positive if a > b.
251 int hit_cmp_by_q_index(const void *a, const void *b)
256 return (int) (ia->q_index - ib->q_index);
260 # Qsort hit struct comparision function of score.
261 # Returns negative if b < a and positive if a < b.
262 # We multiply with 1000 to maintain 2 decimal floats.
264 int hit_cmp_by_score(const void *a, const void *b)
269 return (int) (1000 * ib->score - 1000 * ia->score);
273 # Given a sorted unsigned integer removes all duplicate values
274 # and move the unique values to the front of the array. Returns
275 # the new size of the array.
277 unsigned int uniq_ary(unsigned int *ary, unsigned int ary_size)
279 unsigned int new_ary_size = 0;
283 for (i = 1; i < ary_size; i++)
285 if (ary[i] != ary[j])
288 ary[j] = ary[i]; // Move it to the front
292 new_ary_size = j + 1;
298 # Method that given a byte array and its size in bytes
299 # sets all bytes to 0.
302 VALUE _ary, // Byte array to zero.
303 VALUE _ary_size // Size of array.
306 char *ary = (char *) StringValuePtr(_ary);
307 unsigned int ary_size = FIX2UINT(_ary_size);
309 bzero(ary, ary_size);
313 # Method to sort array of hits according to q_index.
316 VALUE _ary, // Array of hits.
317 VALUE _ary_size // Size of array.
320 hit *ary = (hit *) StringValuePtr(_ary);
321 unsigned int ary_size = FIX2UINT(_ary_size);
323 qsort(ary, ary_size, sizeof(hit), hit_cmp_by_q_index);
327 # Method that given a string (char array) encodes all kmers overlapping
328 # with a given step size as integers that are saved in an unsigned integer
329 # array. Then the array is sorted and uniq'ed and the number of unique
330 # elements is returned.
332 VALUE str_to_oligo_ary_c(
333 VALUE _str, // DNA or RNA string.
334 VALUE _str_size, // String length.
335 VALUE _ary, // Array to store result in.
336 VALUE _kmer, // Size of kmers/oligos.
337 VALUE _step // Step size for overlapping kmers.
340 unsigned char *str = StringValuePtr(_str);
341 unsigned int str_size = FIX2UINT(_str_size);
342 unsigned int *ary = (unsigned int *) StringValuePtr(_ary);
343 unsigned int kmer = FIX2UINT(_kmer);
344 unsigned int step = FIX2UINT(_step);
346 unsigned int ary_size = 0;
347 unsigned int mask = (1 << (2 * kmer)) - 1;
348 unsigned int bin = 0;
351 for (i = 0; i < kmer; i++)
354 bin |= oligo_map[str[i]];
357 ary[ary_size++] = bin;
359 for (i = kmer; i < str_size; i++)
362 bin |= oligo_map[str[i]];
364 if ((i % step) == 0) {
365 ary[ary_size++] = (bin & mask);
369 qsort(ary, ary_size, sizeof(unsigned int), uint_cmp);
370 ary_size = uniq_ary(ary, ary_size);
372 return UINT2NUM(ary_size);
376 # Method that given a string (char array) encodes all kmers overlapping
377 # with a given step size as integers that are pushed onto a Ruby array
379 # TODO should have an option for skipping oligos with ambiguity codes.
381 VALUE str_to_oligo_rb_ary_c(
382 VALUE _str, // DNA or RNA string.
383 VALUE _kmer, // Size of kmer or oligo.
384 VALUE _step // Step size for overlapping kmers.
387 char *str = StringValuePtr(_str);
388 unsigned int kmer = FIX2UINT(_kmer);
389 unsigned int step = FIX2UINT(_step);
391 VALUE array = rb_ary_new();
392 long len = strlen(str);
393 unsigned int mask = (1 << (2 * kmer)) - 1;
394 unsigned int bin = 0;
397 for (i = 0; i < kmer; i++)
400 bin |= oligo_map[str[i]];
403 rb_ary_push(array, INT2FIX(bin));
405 for (i = kmer; i < len; i++)
408 bin |= oligo_map[str[i]];
410 if ((i % step) == 0) {
411 rb_ary_push(array, INT2FIX((bin & mask)));
419 # Method that counts all shared oligos/kmers between a subject sequence and
420 # all query sequences. For each oligo in the subject sequence (s_ary) the
421 # index of all query sequences containing this oligo is found the the q_ary
422 # where this information is stored sequentially in intervals. For the
423 # particula oligo the interval is looked up in the q_beg and q_end arrays.
424 # Shared oligos are recorded in the shared_ary.
427 VALUE _q_ary, // Lookup array with q_index lists.
428 VALUE _q_begs_ary, // Lookup array with interval begins of q_index lists.
429 VALUE _q_ends_ary, // Lookup array with interval ends of q_index lists.
430 VALUE _s_ary, // Subject sequence oligo array.
431 VALUE _s_ary_size, // Subject sequence oligo array size.
432 VALUE _shared_ary // Array with shared oligo counts.
435 unsigned int *q_ary = (unsigned int *) StringValuePtr(_q_ary);
436 unsigned int *q_begs_ary = (unsigned int *) StringValuePtr(_q_begs_ary);
437 unsigned int *q_ends_ary = (unsigned int *) StringValuePtr(_q_ends_ary);
438 unsigned int *s_ary = (unsigned int *) StringValuePtr(_s_ary);
439 unsigned int s_ary_size = FIX2UINT(_s_ary_size);
440 unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
442 unsigned int oligo = 0;
445 unsigned int q_beg = 0;
446 unsigned int q_end = 0;
448 for (s = 0; s < s_ary_size; s++) // each oligo do
452 q_beg = q_begs_ary[oligo];
453 q_end = q_ends_ary[oligo];
455 for (q = q_beg; q < q_end; q++)
457 shared_ary[q_ary[q]]++;
463 # Method to calculate the score for all query vs subject sequence comparisons.
464 # The score is calculated as the number of unique shared oligos divided by the
465 # smallest number of unique oligos in either the subject or query sequence.
466 # Only scores greater than or equal to a given minimum is stored in the hit
467 # array and the size of the hit array is returned.
470 VALUE _q_total_ary, // Lookup array with total oligo counts.
471 VALUE _shared_ary, // Lookup arary with shared oligo counts.
472 VALUE _q_size, // Number of query sequences.
473 VALUE _s_count, // Number of unique oligos in subject sequence.
474 VALUE _hit_ary, // Result array.
475 VALUE _hit_ary_size, // Result array size.
476 VALUE _s_index, // Subject sequence index.
477 VALUE _min_score // Minimum score to include.
480 unsigned int *q_total_ary = (unsigned int *) StringValuePtr(_q_total_ary);
481 unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
482 unsigned int q_size = FIX2UINT(_q_size);
483 unsigned int s_count = FIX2UINT(_s_count);
484 hit *hit_ary = (hit *) StringValuePtr(_hit_ary);
485 unsigned int hit_ary_size = FIX2UINT(_hit_ary_size);
486 unsigned int s_index = FIX2UINT(_s_index);
487 float min_score = (float) NUM2DBL(_min_score);
489 unsigned int q_index = 0;
490 unsigned int q_count = 0;
491 unsigned int shared_count = 0;
492 unsigned int total_count = 0;
494 hit new_score = {0, 0, 0};
496 for (q_index = 0; q_index < q_size; q_index++)
498 shared_count = shared_ary[q_index];
499 q_count = q_total_ary[q_index];
500 total_count = (s_count < q_count) ? s_count : q_count;
502 score = shared_count / (float) total_count;
504 if (score >= min_score)
506 new_score.q_index = q_index;
507 new_score.s_index = s_index;
508 new_score.score = score;
510 hit_ary[hit_ary_size] = new_score;
516 return UINT2NUM(hit_ary_size);
520 # Method that fetches all hits matching a given q_index value
521 # from the result_ary. Since the result_ary is sorted according
522 # to q_index we start at a given hit_index and fetch until
523 # the next hit with a different q_index. The fetched hits are
524 # stored in an array which is sorted according to decreasing
525 # score. The size of this array is returned.
528 VALUE _result_ary, // Result array
529 VALUE _result_ary_size, // Result array size
530 VALUE _hit_index, // Offset position in hit_ary to search from.
531 VALUE _hit_ary, // Array to store hits in.
532 VALUE _q_index) // q_index to fetch.
534 hit *result_ary = (hit *) StringValuePtr(_result_ary);
535 unsigned int result_ary_size = FIX2UINT(_result_ary_size);
536 unsigned int hit_index = FIX2UINT(_hit_index);
537 hit *hit_ary = (hit *) StringValuePtr(_hit_ary);
538 unsigned int q_index = FIX2UINT(_q_index);
539 hit new_hit = {0, 0, 0};
540 unsigned int hit_ary_size = 0;
542 while ((hit_index + hit_ary_size) < result_ary_size)
544 new_hit = result_ary[hit_index + hit_ary_size];
546 if (new_hit.q_index == q_index)
548 hit_ary[hit_ary_size] = new_hit;
558 qsort(hit_ary, hit_ary_size, sizeof(hit), hit_cmp_by_score);
560 return UINT2NUM(hit_ary_size);
565 # >>>>>>>>>>>>>>> Embedded classes <<<<<<<<<<<<<<<
567 # Class for holding score information.
569 attr_reader :q_id, :s_id, :score
571 # Method to initialize Hit object with
572 # query and subject id a score.
573 def initialize(q_id, s_id, score)
579 # Method for outputting score objects.
581 "#{@q_id}:#{@s_id}:#{@score.round(2)}"