1 # Copyright (C) 2007-2012 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 require 'maasha/fasta'
27 require 'maasha/align'
31 BYTES_IN_HIT = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT # i.e. 12
32 NUC_ALPH_SIZE = 4 # Alphabet size of nucleotides.
33 RESULT_ARY_MAX = 50_000_000 # Maximum size for the result_ary.
34 HIT_ARY_MAX = 100_000 # Maximum size for the hit_ary.
36 # FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
37 # The purpose is to find similarities between query DNA/RNA sequences and a
38 # database of small subunit ribosomal DNA/RNA sequences. The sequence
39 # similarities are calculated as the number of shared unique oligos/kmers
40 # divided by the smallest number of unique oligoes in either the query or
41 # database sequence. This yields a rough under estimate of similarity e.g. 50%
42 # oligo similarity may correspond to 80% similarity on a nucleotide level
43 # (needs clarification). The outcome of FindSim is a table with a row per
44 # query sequence and the columns are the database hits sorted according to
47 # Extensive use of inline C for speed.
51 # Method to initialize FindSim Object.
52 def initialize(opt_hash)
67 # Method to load sequences from a query file in FASTA format
68 # and index these for oligo based similarty search.
72 oligo_hash = Hash.new { |h, k| h[k] = [] }
76 Fasta.open(file, 'r') do |ios|
78 @q_ids << entry.seq_name if @opt_hash[:query_ids]
79 @q_entries << entry if @opt_hash[:realign]
81 oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort
83 q_total << oligos.size
85 oligos.each do |oligo|
86 oligo_hash[oligo] << count
95 create_query_index(q_total, oligo_hash)
97 $stderr.puts "Loaded #{count} query sequences in #{(Time.now - time)} seconds." if @opt_hash[:verbose]
100 # Method to search database or subject sequences from a FASTA file by
101 # locating for each sequence all shared oligos with the query index.
104 oligo_ary = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
105 shared_ary = "\0" * @q_size * BYTES_IN_INT
106 result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT
109 Fasta.open(file, 'r') do |ios|
110 ios.each_with_index do |entry, s_index|
111 @s_ids << entry.seq_name if @opt_hash[:subject_ids]
112 @s_entries << entry if @opt_hash[:realign]
114 zero_ary_c(oligo_ary, (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT)
115 zero_ary_c(shared_ary, @q_size * BYTES_IN_INT)
117 oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step])
119 count_shared_c(@q_ary, @q_begs_ary, @q_ends_ary, oligo_ary, oligo_ary_size, shared_ary)
121 result_count = calc_scores_c(@q_total_ary, shared_ary, @q_size, oligo_ary_size, result_ary, result_count, s_index, @opt_hash[:min_score])
123 if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
124 $stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
129 @result = result_ary[0 ... result_count * BYTES_IN_HIT]
130 @result_count = result_count
133 # Method that for each query index yields all hits, sorted according to
134 # decending score, as Hit objects.
136 sort_hits_c(@result, @result_count)
138 hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
142 (0 ... @q_size).each do |q_index|
143 zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
144 hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
146 if @opt_hash[:max_hits]
147 max = (hit_ary_size > @opt_hash[:max_hits]) ? @opt_hash[:max_hits] : hit_ary_size
152 (0 ... max).each do |i|
153 q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
155 q_id = @opt_hash[:query_ids] ? @q_ids[q_index] : q_index
156 s_id = @opt_hash[:subject_ids] ? @s_ids[s_index] : s_index
158 if @opt_hash[:realign]
159 new_score = Align.muscle([@q_entries[q_index], @s_entries[s_index]]).identity
160 score = new_score if new_score > score
163 if @opt_hash[:max_diversity]
164 best_score = score if i == 0
165 break if best_score - score > @opt_hash[:max_diversity]
168 yield Hit.new(q_id, s_id, score)
171 hit_index += hit_ary_size
179 # Method to create the query index for FindSim search. The index consists of
180 # four lookup arrays which are stored as instance variables:
181 # * @q_total_ary holds per index the total of unique oligos for that
182 # particular query sequences.
183 # * @q_ary holds sequencially lists of query indexes so it is possible
184 # to lookup the list for a particular oligo and get all query indeces for
185 # query sequences containing this oligo.
186 # * @q_begs_ary holds per index the begin coordinate for the @q_ary list for
187 # a particular oligo.
188 # * @q_ends_ary holds per index the end coordinate for the @q_ary list for a
190 def create_query_index(q_total, oligo_hash)
191 @q_total_ary = q_total.pack("I*")
196 oligo_begs = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
197 oligo_ends = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
199 oligo_hash.each do |oligo, list|
200 @q_ary << list.pack("I*")
201 oligo_begs[oligo] = beg
202 oligo_ends[oligo] = beg + list.size
207 @q_begs_ary = oligo_begs.pack("I*")
208 @q_ends_ary = oligo_ends.pack("I*")
211 # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
214 # Bitmap lookup table where the index is ASCII values
215 # and the following nucleotides are encoded as:
221 unsigned char oligo_map[256] = {
222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
229 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
233 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
234 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
236 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
237 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
241 # Defining a hit struct to hold hits consisting of query
242 # sequence index, subject sequence index and score.
246 unsigned int q_index;
247 unsigned int s_index;
252 # Qsort unsigned int comparison function.
253 # Returns negative if b > a and positive if a > b.
255 int uint_cmp(const void *a, const void *b)
257 const unsigned int *ia = (const unsigned int *) a;
258 const unsigned int *ib = (const unsigned int *) b;
264 # Qsort hit struct comparision function of q_index.
265 # Returns negative if b > a and positive if a > b.
267 int hit_cmp_by_q_index(const void *a, const void *b)
272 return (int) (ia->q_index - ib->q_index);
276 # Qsort hit struct comparision function of score.
277 # Returns negative if b < a and positive if a < b.
278 # We multiply with 1000 to maintain 2 decimal floats.
280 int hit_cmp_by_score(const void *a, const void *b)
285 return (int) (1000 * ib->score - 1000 * ia->score);
289 # Given a sorted unsigned integer removes all duplicate values
290 # and move the unique values to the front of the array. Returns
291 # the new size of the array.
293 unsigned int uniq_ary(unsigned int *ary, unsigned int ary_size)
295 unsigned int new_ary_size = 0;
299 for (i = 1; i < ary_size; i++)
301 if (ary[i] != ary[j])
304 ary[j] = ary[i]; // Move it to the front
308 new_ary_size = j + 1;
314 # Method that given a byte array and its size in bytes
315 # sets all bytes to 0.
318 VALUE _ary, // Byte array to zero.
319 VALUE _ary_size // Size of array.
322 char *ary = (char *) StringValuePtr(_ary);
323 unsigned int ary_size = FIX2UINT(_ary_size);
325 bzero(ary, ary_size);
329 # Method to sort array of hits according to q_index.
332 VALUE _ary, // Array of hits.
333 VALUE _ary_size // Size of array.
336 hit *ary = (hit *) StringValuePtr(_ary);
337 unsigned int ary_size = FIX2UINT(_ary_size);
339 qsort(ary, ary_size, sizeof(hit), hit_cmp_by_q_index);
343 # Method that given a string (char array) encodes all kmers overlapping
344 # with a given step size as integers that are saved in an unsigned integer
345 # array. Then the array is sorted and uniq'ed and the number of unique
346 # elements is returned.
348 VALUE str_to_oligo_ary_c(
349 VALUE _str, // DNA or RNA string.
350 VALUE _str_size, // String length.
351 VALUE _ary, // Array to store result in.
352 VALUE _kmer, // Size of kmers/oligos.
353 VALUE _step // Step size for overlapping kmers.
356 unsigned char *str = StringValuePtr(_str);
357 unsigned int str_size = FIX2UINT(_str_size);
358 unsigned int *ary = (unsigned int *) StringValuePtr(_ary);
359 unsigned int kmer = FIX2UINT(_kmer);
360 unsigned int step = FIX2UINT(_step);
362 unsigned int ary_size = 0;
363 unsigned int mask = (1 << (2 * kmer)) - 1;
364 unsigned int bin = 0;
367 for (i = 0; i < kmer; i++)
370 bin |= oligo_map[str[i]];
373 ary[ary_size++] = bin;
375 for (i = kmer; i < str_size; i++)
378 bin |= oligo_map[str[i]];
380 if ((i % step) == 0) {
381 ary[ary_size++] = (bin & mask);
385 qsort(ary, ary_size, sizeof(unsigned int), uint_cmp);
386 ary_size = uniq_ary(ary, ary_size);
388 return UINT2NUM(ary_size);
392 # Method that given a string (char array) encodes all kmers overlapping
393 # with a given step size as integers that are pushed onto a Ruby array
395 # TODO should have an option for skipping oligos with ambiguity codes.
397 VALUE str_to_oligo_rb_ary_c(
398 VALUE _str, // DNA or RNA string.
399 VALUE _kmer, // Size of kmer or oligo.
400 VALUE _step // Step size for overlapping kmers.
403 char *str = StringValuePtr(_str);
404 unsigned int kmer = FIX2UINT(_kmer);
405 unsigned int step = FIX2UINT(_step);
407 VALUE array = rb_ary_new();
408 long len = strlen(str);
409 unsigned int mask = (1 << (2 * kmer)) - 1;
410 unsigned int bin = 0;
413 for (i = 0; i < kmer; i++)
416 bin |= oligo_map[str[i]];
419 rb_ary_push(array, INT2FIX(bin));
421 for (i = kmer; i < len; i++)
424 bin |= oligo_map[str[i]];
426 if ((i % step) == 0) {
427 rb_ary_push(array, INT2FIX((bin & mask)));
435 # Method that counts all shared oligos/kmers between a subject sequence and
436 # all query sequences. For each oligo in the subject sequence (s_ary) the
437 # index of all query sequences containing this oligo is found the the q_ary
438 # where this information is stored sequentially in intervals. For the
439 # particula oligo the interval is looked up in the q_beg and q_end arrays.
440 # Shared oligos are recorded in the shared_ary.
443 VALUE _q_ary, // Lookup array with q_index lists.
444 VALUE _q_begs_ary, // Lookup array with interval begins of q_index lists.
445 VALUE _q_ends_ary, // Lookup array with interval ends of q_index lists.
446 VALUE _s_ary, // Subject sequence oligo array.
447 VALUE _s_ary_size, // Subject sequence oligo array size.
448 VALUE _shared_ary // Array with shared oligo counts.
451 unsigned int *q_ary = (unsigned int *) StringValuePtr(_q_ary);
452 unsigned int *q_begs_ary = (unsigned int *) StringValuePtr(_q_begs_ary);
453 unsigned int *q_ends_ary = (unsigned int *) StringValuePtr(_q_ends_ary);
454 unsigned int *s_ary = (unsigned int *) StringValuePtr(_s_ary);
455 unsigned int s_ary_size = FIX2UINT(_s_ary_size);
456 unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
458 unsigned int oligo = 0;
461 unsigned int q_beg = 0;
462 unsigned int q_end = 0;
464 for (s = 0; s < s_ary_size; s++) // each oligo do
468 q_beg = q_begs_ary[oligo];
469 q_end = q_ends_ary[oligo];
471 for (q = q_beg; q < q_end; q++)
473 shared_ary[q_ary[q]]++;
479 # Method to calculate the score for all query vs subject sequence comparisons.
480 # The score is calculated as the number of unique shared oligos divided by the
481 # smallest number of unique oligos in either the subject or query sequence.
482 # Only scores greater than or equal to a given minimum is stored in the hit
483 # array and the size of the hit array is returned.
486 VALUE _q_total_ary, // Lookup array with total oligo counts.
487 VALUE _shared_ary, // Lookup arary with shared oligo counts.
488 VALUE _q_size, // Number of query sequences.
489 VALUE _s_count, // Number of unique oligos in subject sequence.
490 VALUE _hit_ary, // Result array.
491 VALUE _hit_ary_size, // Result array size.
492 VALUE _s_index, // Subject sequence index.
493 VALUE _min_score // Minimum score to include.
496 unsigned int *q_total_ary = (unsigned int *) StringValuePtr(_q_total_ary);
497 unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
498 unsigned int q_size = FIX2UINT(_q_size);
499 unsigned int s_count = FIX2UINT(_s_count);
500 hit *hit_ary = (hit *) StringValuePtr(_hit_ary);
501 unsigned int hit_ary_size = FIX2UINT(_hit_ary_size);
502 unsigned int s_index = FIX2UINT(_s_index);
503 float min_score = (float) NUM2DBL(_min_score);
505 unsigned int q_index = 0;
506 unsigned int q_count = 0;
507 unsigned int shared_count = 0;
508 unsigned int total_count = 0;
510 hit new_score = {0, 0, 0};
512 for (q_index = 0; q_index < q_size; q_index++)
514 shared_count = shared_ary[q_index];
515 q_count = q_total_ary[q_index];
516 total_count = (s_count < q_count) ? s_count : q_count;
518 score = shared_count / (float) total_count;
520 if (score >= min_score)
522 new_score.q_index = q_index;
523 new_score.s_index = s_index;
524 new_score.score = score;
526 hit_ary[hit_ary_size] = new_score;
532 return UINT2NUM(hit_ary_size);
536 # Method that fetches all hits matching a given q_index value
537 # from the result_ary. Since the result_ary is sorted according
538 # to q_index we start at a given hit_index and fetch until
539 # the next hit with a different q_index. The fetched hits are
540 # stored in an array which is sorted according to decreasing
541 # score. The size of this array is returned.
544 VALUE _result_ary, // Result array
545 VALUE _result_ary_size, // Result array size
546 VALUE _hit_index, // Offset position in hit_ary to search from.
547 VALUE _hit_ary, // Array to store hits in.
548 VALUE _q_index) // q_index to fetch.
550 hit *result_ary = (hit *) StringValuePtr(_result_ary);
551 unsigned int result_ary_size = FIX2UINT(_result_ary_size);
552 unsigned int hit_index = FIX2UINT(_hit_index);
553 hit *hit_ary = (hit *) StringValuePtr(_hit_ary);
554 unsigned int q_index = FIX2UINT(_q_index);
555 hit new_hit = {0, 0, 0};
556 unsigned int hit_ary_size = 0;
558 while ((hit_index + hit_ary_size) < result_ary_size)
560 new_hit = result_ary[hit_index + hit_ary_size];
562 if (new_hit.q_index == q_index)
564 hit_ary[hit_ary_size] = new_hit;
574 qsort(hit_ary, hit_ary_size, sizeof(hit), hit_cmp_by_score);
576 return UINT2NUM(hit_ary_size);
581 # >>>>>>>>>>>>>>> Embedded classes <<<<<<<<<<<<<<<
583 # Class for holding score information.
585 attr_reader :q_id, :s_id, :score
587 # Method to initialize Hit object with
588 # query and subject id a score.
589 def initialize(q_id, s_id, score)
595 # Method for outputting score objects.
597 "#{@q_id}:#{@s_id}:#{@score.round(2)}"
600 # Method to convert to a Biopiece record.
603 hash[:REC_TYPE] = "findsim"
606 hash[:SCORE] = @score.round(2)