1 # Copyright (C) 2007-2012 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 require 'maasha/fasta'
27 require 'maasha/align'
32 RESULT_ARY_MAX = 50_000_000 # Maximum size for the result_ary.
33 HIT_ARY_MAX = 100_000 # Maximum size for the hit_ary.
35 # FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
36 # The purpose is to find similarities between query DNA/RNA sequences and a
37 # database of small subunit ribosomal DNA/RNA sequences. The sequence
38 # similarities are calculated as the number of shared unique oligos/kmers
39 # divided by the smallest number of unique oligoes in either the query or
40 # database sequence. This yields a rough under estimate of similarity e.g. 50%
41 # oligo similarity may correspond to 80% similarity on a nucleotide level
42 # (needs clarification). The outcome of FindSim is a table with a row per
43 # query sequence and the columns are the database hits sorted according to
46 # Extensive use of inline C for speed.
50 # Method to initialize FindSim Object.
51 def initialize(opt_hash)
66 # Method to load sequences from a query file in FASTA format
67 # and index these for oligo based similarty search.
71 oligo_hash = Hash.new { |h, k| h[k] = [] }
75 Fasta.open(file, 'r') do |ios|
77 @q_ids << entry.seq_name if @opt_hash[:query_ids]
78 @q_entries << entry if @opt_hash[:realign]
80 oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort
82 q_total << oligos.size
84 oligos.each do |oligo|
85 oligo_hash[oligo] << count
94 create_query_index(q_total, oligo_hash)
96 $stderr.puts "Loaded #{count} query sequences in #{(Time.now - time)} seconds." if @opt_hash[:verbose]
99 # Method to search database or subject sequences from a FASTA file by
100 # locating for each sequence all shared oligos with the query index.
103 oligo_ary = "\0" * (4 ** @opt_hash[:kmer]) * BYTES_IN_INT
104 shared_ary = "\0" * @q_size * BYTES_IN_INT
105 result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT
108 Fasta.open(file, 'r') do |ios|
109 ios.each_with_index do |entry, s_index|
110 @s_ids << entry.seq_name if @opt_hash[:subject_ids]
111 @s_entries << entry if @opt_hash[:realign]
113 zero_ary_c(oligo_ary, (4 ** @opt_hash[:kmer]) * BYTES_IN_INT)
114 zero_ary_c(shared_ary, @q_size * BYTES_IN_INT)
116 oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step])
118 count_shared_c(@q_ary, @q_begs_ary, @q_ends_ary, oligo_ary, oligo_ary_size, shared_ary)
120 result_count = calc_scores_c(@q_total_ary, shared_ary, @q_size, oligo_ary_size, result_ary, result_count, s_index, @opt_hash[:min_score])
122 if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
123 $stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
128 @result = result_ary[0 ... result_count * BYTES_IN_HIT]
129 @result_count = result_count
132 # Method that for each query index yields all hits, sorted according to
133 # decending score, as Hit objects.
135 sort_hits_c(@result, @result_count)
137 hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
141 (0 ... @q_size).each do |q_index|
142 zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
143 hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
145 if @opt_hash[:max_hits]
146 max = (hit_ary_size > @opt_hash[:max_hits]) ? @opt_hash[:max_hits] : hit_ary_size
151 (0 ... max).each do |i|
152 q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
154 q_id = @opt_hash[:query_ids] ? @q_ids[q_index] : q_index
155 s_id = @opt_hash[:subject_ids] ? @s_ids[s_index] : s_index
157 if @opt_hash[:realign]
158 new_score = Align.muscle([@q_entries[q_index], @s_entries[s_index]]).identity
159 score = new_score if new_score > score
162 yield Hit.new(q_id, s_id, score)
165 hit_index += hit_ary_size
173 # Method to create the query index for FindSim search. The index consists of
174 # four lookup arrays which are stored as instance variables:
175 # * @q_total_ary holds per index the total of unique oligos for that
176 # particular query sequences.
177 # * @q_ary holds sequencially lists of query indexes so it is possible
178 # to lookup the list for a particular oligo and get all query indeces for
179 # query sequences containing this oligo.
180 # * @q_begs_ary holds per index the begin coordinate for the @q_ary list for
181 # a particular oligo.
182 # * @q_ends_ary holds per index the end coordinate for the @q_ary list for a
184 def create_query_index(q_total, oligo_hash)
185 @q_total_ary = q_total.pack("I*")
190 oligo_begs = Array.new(4 ** @opt_hash[:kmer], 0)
191 oligo_ends = Array.new(4 ** @opt_hash[:kmer], 0)
193 oligo_hash.each do |oligo, list|
194 @q_ary << list.pack("I*")
195 oligo_begs[oligo] = beg
196 oligo_ends[oligo] = beg + list.size
201 @q_begs_ary = oligo_begs.pack("I*")
202 @q_ends_ary = oligo_ends.pack("I*")
205 # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
208 # Bitmap lookup table where the index is ASCII values
209 # and the following nucleotides are encoded as:
215 unsigned char oligo_map[256] = {
216 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
235 # Defining a hit struct to hold hits consisting of query
236 # sequence index, subject sequence index and score.
240 unsigned int q_index;
241 unsigned int s_index;
246 # Qsort unsigned int comparison function.
247 # Returns negative if b > a and positive if a > b.
249 int uint_cmp(const void *a, const void *b)
251 const unsigned int *ia = (const unsigned int *) a;
252 const unsigned int *ib = (const unsigned int *) b;
258 # Qsort hit struct comparision function of q_index.
259 # Returns negative if b > a and positive if a > b.
261 int hit_cmp_by_q_index(const void *a, const void *b)
266 return (int) (ia->q_index - ib->q_index);
270 # Qsort hit struct comparision function of score.
271 # Returns negative if b < a and positive if a < b.
272 # We multiply with 1000 to maintain 2 decimal floats.
274 int hit_cmp_by_score(const void *a, const void *b)
279 return (int) (1000 * ib->score - 1000 * ia->score);
283 # Given a sorted unsigned integer removes all duplicate values
284 # and move the unique values to the front of the array. Returns
285 # the new size of the array.
287 unsigned int uniq_ary(unsigned int *ary, unsigned int ary_size)
289 unsigned int new_ary_size = 0;
293 for (i = 1; i < ary_size; i++)
295 if (ary[i] != ary[j])
298 ary[j] = ary[i]; // Move it to the front
302 new_ary_size = j + 1;
308 # Method that given a byte array and its size in bytes
309 # sets all bytes to 0.
312 VALUE _ary, // Byte array to zero.
313 VALUE _ary_size // Size of array.
316 char *ary = (char *) StringValuePtr(_ary);
317 unsigned int ary_size = FIX2UINT(_ary_size);
319 bzero(ary, ary_size);
323 # Method to sort array of hits according to q_index.
326 VALUE _ary, // Array of hits.
327 VALUE _ary_size // Size of array.
330 hit *ary = (hit *) StringValuePtr(_ary);
331 unsigned int ary_size = FIX2UINT(_ary_size);
333 qsort(ary, ary_size, sizeof(hit), hit_cmp_by_q_index);
337 # Method that given a string (char array) encodes all kmers overlapping
338 # with a given step size as integers that are saved in an unsigned integer
339 # array. Then the array is sorted and uniq'ed and the number of unique
340 # elements is returned.
342 VALUE str_to_oligo_ary_c(
343 VALUE _str, // DNA or RNA string.
344 VALUE _str_size, // String length.
345 VALUE _ary, // Array to store result in.
346 VALUE _kmer, // Size of kmers/oligos.
347 VALUE _step // Step size for overlapping kmers.
350 unsigned char *str = StringValuePtr(_str);
351 unsigned int str_size = FIX2UINT(_str_size);
352 unsigned int *ary = (unsigned int *) StringValuePtr(_ary);
353 unsigned int kmer = FIX2UINT(_kmer);
354 unsigned int step = FIX2UINT(_step);
356 unsigned int ary_size = 0;
357 unsigned int mask = (1 << (2 * kmer)) - 1;
358 unsigned int bin = 0;
361 for (i = 0; i < kmer; i++)
364 bin |= oligo_map[str[i]];
367 ary[ary_size++] = bin;
369 for (i = kmer; i < str_size; i++)
372 bin |= oligo_map[str[i]];
374 if ((i % step) == 0) {
375 ary[ary_size++] = (bin & mask);
379 qsort(ary, ary_size, sizeof(unsigned int), uint_cmp);
380 ary_size = uniq_ary(ary, ary_size);
382 return UINT2NUM(ary_size);
386 # Method that given a string (char array) encodes all kmers overlapping
387 # with a given step size as integers that are pushed onto a Ruby array
389 # TODO should have an option for skipping oligos with ambiguity codes.
391 VALUE str_to_oligo_rb_ary_c(
392 VALUE _str, // DNA or RNA string.
393 VALUE _kmer, // Size of kmer or oligo.
394 VALUE _step // Step size for overlapping kmers.
397 char *str = StringValuePtr(_str);
398 unsigned int kmer = FIX2UINT(_kmer);
399 unsigned int step = FIX2UINT(_step);
401 VALUE array = rb_ary_new();
402 long len = strlen(str);
403 unsigned int mask = (1 << (2 * kmer)) - 1;
404 unsigned int bin = 0;
407 for (i = 0; i < kmer; i++)
410 bin |= oligo_map[str[i]];
413 rb_ary_push(array, INT2FIX(bin));
415 for (i = kmer; i < len; i++)
418 bin |= oligo_map[str[i]];
420 if ((i % step) == 0) {
421 rb_ary_push(array, INT2FIX((bin & mask)));
429 # Method that counts all shared oligos/kmers between a subject sequence and
430 # all query sequences. For each oligo in the subject sequence (s_ary) the
431 # index of all query sequences containing this oligo is found the the q_ary
432 # where this information is stored sequentially in intervals. For the
433 # particula oligo the interval is looked up in the q_beg and q_end arrays.
434 # Shared oligos are recorded in the shared_ary.
437 VALUE _q_ary, // Lookup array with q_index lists.
438 VALUE _q_begs_ary, // Lookup array with interval begins of q_index lists.
439 VALUE _q_ends_ary, // Lookup array with interval ends of q_index lists.
440 VALUE _s_ary, // Subject sequence oligo array.
441 VALUE _s_ary_size, // Subject sequence oligo array size.
442 VALUE _shared_ary // Array with shared oligo counts.
445 unsigned int *q_ary = (unsigned int *) StringValuePtr(_q_ary);
446 unsigned int *q_begs_ary = (unsigned int *) StringValuePtr(_q_begs_ary);
447 unsigned int *q_ends_ary = (unsigned int *) StringValuePtr(_q_ends_ary);
448 unsigned int *s_ary = (unsigned int *) StringValuePtr(_s_ary);
449 unsigned int s_ary_size = FIX2UINT(_s_ary_size);
450 unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
452 unsigned int oligo = 0;
455 unsigned int q_beg = 0;
456 unsigned int q_end = 0;
458 for (s = 0; s < s_ary_size; s++) // each oligo do
462 q_beg = q_begs_ary[oligo];
463 q_end = q_ends_ary[oligo];
465 for (q = q_beg; q < q_end; q++)
467 shared_ary[q_ary[q]]++;
473 # Method to calculate the score for all query vs subject sequence comparisons.
474 # The score is calculated as the number of unique shared oligos divided by the
475 # smallest number of unique oligos in either the subject or query sequence.
476 # Only scores greater than or equal to a given minimum is stored in the hit
477 # array and the size of the hit array is returned.
480 VALUE _q_total_ary, // Lookup array with total oligo counts.
481 VALUE _shared_ary, // Lookup arary with shared oligo counts.
482 VALUE _q_size, // Number of query sequences.
483 VALUE _s_count, // Number of unique oligos in subject sequence.
484 VALUE _hit_ary, // Result array.
485 VALUE _hit_ary_size, // Result array size.
486 VALUE _s_index, // Subject sequence index.
487 VALUE _min_score // Minimum score to include.
490 unsigned int *q_total_ary = (unsigned int *) StringValuePtr(_q_total_ary);
491 unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
492 unsigned int q_size = FIX2UINT(_q_size);
493 unsigned int s_count = FIX2UINT(_s_count);
494 hit *hit_ary = (hit *) StringValuePtr(_hit_ary);
495 unsigned int hit_ary_size = FIX2UINT(_hit_ary_size);
496 unsigned int s_index = FIX2UINT(_s_index);
497 float min_score = (float) NUM2DBL(_min_score);
499 unsigned int q_index = 0;
500 unsigned int q_count = 0;
501 unsigned int shared_count = 0;
502 unsigned int total_count = 0;
504 hit new_score = {0, 0, 0};
506 for (q_index = 0; q_index < q_size; q_index++)
508 shared_count = shared_ary[q_index];
509 q_count = q_total_ary[q_index];
510 total_count = (s_count < q_count) ? s_count : q_count;
512 score = shared_count / (float) total_count;
514 if (score >= min_score)
516 new_score.q_index = q_index;
517 new_score.s_index = s_index;
518 new_score.score = score;
520 hit_ary[hit_ary_size] = new_score;
526 return UINT2NUM(hit_ary_size);
530 # Method that fetches all hits matching a given q_index value
531 # from the result_ary. Since the result_ary is sorted according
532 # to q_index we start at a given hit_index and fetch until
533 # the next hit with a different q_index. The fetched hits are
534 # stored in an array which is sorted according to decreasing
535 # score. The size of this array is returned.
538 VALUE _result_ary, // Result array
539 VALUE _result_ary_size, // Result array size
540 VALUE _hit_index, // Offset position in hit_ary to search from.
541 VALUE _hit_ary, // Array to store hits in.
542 VALUE _q_index) // q_index to fetch.
544 hit *result_ary = (hit *) StringValuePtr(_result_ary);
545 unsigned int result_ary_size = FIX2UINT(_result_ary_size);
546 unsigned int hit_index = FIX2UINT(_hit_index);
547 hit *hit_ary = (hit *) StringValuePtr(_hit_ary);
548 unsigned int q_index = FIX2UINT(_q_index);
549 hit new_hit = {0, 0, 0};
550 unsigned int hit_ary_size = 0;
552 while ((hit_index + hit_ary_size) < result_ary_size)
554 new_hit = result_ary[hit_index + hit_ary_size];
556 if (new_hit.q_index == q_index)
558 hit_ary[hit_ary_size] = new_hit;
568 qsort(hit_ary, hit_ary_size, sizeof(hit), hit_cmp_by_score);
570 return UINT2NUM(hit_ary_size);
575 # >>>>>>>>>>>>>>> Embedded classes <<<<<<<<<<<<<<<
577 # Class for holding score information.
579 attr_reader :q_id, :s_id, :score
581 # Method to initialize Hit object with
582 # query and subject id a score.
583 def initialize(q_id, s_id, score)
589 # Method for outputting score objects.
591 "#{@q_id}:#{@s_id}:#{@score.round(2)}"
594 # Method to convert to a Biopiece record.
597 hash[:REC_TYPE] = "findsim"
600 hash[:SCORE] = @score.round(2)