From: martinahansen Date: Tue, 17 Apr 2012 10:09:56 +0000 (+0000) Subject: added findsim.rb X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=8291fe76b04557babbb1b151ac9a0d035253f7f8;p=biopieces.git added findsim.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1794 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/findsim.rb b/code_ruby/lib/maasha/findsim.rb new file mode 100644 index 0000000..5a14065 --- /dev/null +++ b/code_ruby/lib/maasha/findsim.rb @@ -0,0 +1,574 @@ +# Copyright (C) 2007-2012 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'inline' +require 'maasha/fasta' + +BYTES_IN_INT = 4 +BYTES_IN_FLOAT = 4 +BYTES_IN_HIT = 12 +RESULT_ARY_MAX = 50_000_000 # Maximum size for the result_ary. +HIT_ARY_MAX = 100_000 # Maximum size for the hit_ary. + +# FindSim is an implementation of the SimRank logic proposed by Niels Larsen. +# The purpose is to find similarities between query DNA/RNA sequences and a +# database of small subunit ribosomal DNA/RNA sequences. The sequence +# similarities are calculated as the number of shared unique oligos/kmers +# divided by the smallest number of unique oligoes in either the query or +# database sequence. This yields a rough under estimate of similarity e.g. 50% +# oligo similarity may correspond to 80% similarity on a nucleotide level +# (needs clarification). The outcome of FindSim is a table with a row per +# query sequence and the columns are the database hits sorted according to +# similarity. +# +# Extensive use of inline C for speed. +class FindSim + include Enumerable + + # Method to initialize FindSim Object. + def initialize(opt_hash) + @opt_hash = opt_hash + @q_size = 0 + @q_ary = nil + @q_begs_ary = nil + @q_ends_ary = nil + @q_total_ary = nil + @result = nil + @result_count = 0 + end + + # Method to load sequences from a query file in FASTA format + # and index these for oligo based similarty search. + def load_query(file) + time = Time.now + q_total = [] + oligo_hash = Hash.new { |h, k| h[k] = [] } + + count = 0 + + Fasta.open(file, 'r') do |ios| + ios.each do |entry| + oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort + + q_total << oligos.size + + oligos.each do |oligo| + oligo_hash[oligo] << count + end + + count += 1 + end + end + + @q_size = count + + create_query_index(q_total, oligo_hash) + + $stderr.puts "Loaded #{count} query sequences in #{(Time.now - time)} seconds." if @opt_hash[:verbose] + end + + # Method to search database or subject sequences from a FASTA file by + # locating for each sequence all shared oligos with the query index. + def search_db(file) + time = Time.now + oligo_ary = "\0" * (4 ** @opt_hash[:kmer]) * BYTES_IN_INT + shared_ary = "\0" * @q_size * BYTES_IN_INT + result_ary = "\0" * RESULT_ARY_MAX * BYTES_IN_HIT + result_count = 0 + + Fasta.open(file, 'r') do |ios| + ios.each_with_index do |entry, s_index| + zero_ary_c(oligo_ary, (4 ** @opt_hash[:kmer]) * BYTES_IN_INT) + zero_ary_c(shared_ary, @q_size * BYTES_IN_INT) + + oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step]) + + count_shared_c(@q_ary, @q_begs_ary, @q_ends_ary, oligo_ary, oligo_ary_size, shared_ary) + + result_count = calc_scores_c(@q_total_ary, shared_ary, @q_size, oligo_ary_size, result_ary, result_count, s_index, @opt_hash[:min_score]) + + if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose] + $stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)." + end + end + end + + @result = result_ary[0 ... result_count * BYTES_IN_HIT] + @result_count = result_count + end + + # Method that for each query index yields all hits, sorted according to + # decending score, as a list of Score objects. + def each + sort_hits_c(@result, @result_count) + + hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT + + hit_index = 0 + + (0 ... @q_size).each do |q_index| + scores = [] + zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT) + hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index) + + max = (hit_ary_size > @opt_hash[:report_scores]) ? @opt_hash[:report_scores] : hit_ary_size + + (0 ... max).each do |i| + q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF") + + scores << Score.new(score, s_index) + end + + yield scores + + hit_index += hit_ary_size + end + + self + end + + private + + # Method to create the query index for FindSim search. The index consists of + # four lookup arrays which are stored as instance variables: + # * @q_total_ary holds per index the total of unique oligos for that + # particular query sequences. + # * @q_ary holds sequencially lists of query indexes so it is possible + # to lookup the list for a particular oligo and get all query indeces for + # query sequences containing this oligo. + # * @q_begs_ary holds per index the begin coordinate for the @q_ary list for + # a particular oligo. + # * @q_ends_ary holds per index the end coordinate for the @q_ary list for a + # particular oligo. + def create_query_index(q_total, oligo_hash) + @q_total_ary = q_total.pack("I*") + + @q_ary = "" + + beg = 0 + oligo_begs = Array.new(4 ** @opt_hash[:kmer], 0) + oligo_ends = Array.new(4 ** @opt_hash[:kmer], 0) + + oligo_hash.each do |oligo, list| + @q_ary << list.pack("I*") + oligo_begs[oligo] = beg + oligo_ends[oligo] = beg + list.size + + beg += list.size + end + + @q_begs_ary = oligo_begs.pack("I*") + @q_ends_ary = oligo_ends.pack("I*") + end + + # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<< + + inline do |builder| + # Bitmap lookup table where the index is ASCII values + # and the following nucleotides are encoded as: + # T or t = 1 + # U or u = 1 + # C or c = 2 + # G or g = 3 + builder.prefix %{ + unsigned char oligo_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + } + + # Defining a hit struct to hold hits consisting of query + # sequence index, subject sequence index and score. + builder.prefix %{ + typedef struct + { + unsigned int q_index; + unsigned int s_index; + float score; + } hit; + } + + # Qsort unsigned int comparison function. + # Returns negative if b > a and positive if a > b. + builder.prefix %{ + int uint_cmp(const void *a, const void *b) + { + const unsigned int *ia = (const unsigned int *) a; + const unsigned int *ib = (const unsigned int *) b; + + return *ia - *ib; + } + } + + # Qsort hit struct comparision function of q_index. + # Returns negative if b > a and positive if a > b. + builder.prefix %{ + int hit_cmp_by_q_index(const void *a, const void *b) + { + hit *ia = (hit *) a; + hit *ib = (hit *) b; + + return (int) (ia->q_index - ib->q_index); + } + } + + # Qsort hit struct comparision function of score. + # Returns negative if b < a and positive if a < b. + # We multiply with 1000 to maintain 2 decimal floats. + builder.prefix %{ + int hit_cmp_by_score(const void *a, const void *b) + { + hit *ia = (hit *) a; + hit *ib = (hit *) b; + + return (int) (1000 * ib->score - 1000 * ia->score); + } + } + + # Given a sorted unsigned integer removes all duplicate values + # and move the unique values to the front of the array. Returns + # the new size of the array. + builder.prefix %{ + unsigned int uniq_ary(unsigned int *ary, unsigned int ary_size) + { + unsigned int new_ary_size = 0; + unsigned int i = 0; + unsigned int j = 0; + + for (i = 1; i < ary_size; i++) + { + if (ary[i] != ary[j]) + { + j++; + ary[j] = ary[i]; // Move it to the front + } + } + + new_ary_size = j + 1; + + return new_ary_size; + } + } + + # Method that given a byte array and its size in bytes + # sets all bytes to 0. + builder.c %{ + void zero_ary_c( + VALUE _ary, // Byte array to zero. + VALUE _ary_size // Size of array. + ) + { + char *ary = (char *) StringValuePtr(_ary); + unsigned int ary_size = FIX2UINT(_ary_size); + + bzero(ary, ary_size); + } + } + + # Method to sort array of hits according to q_index. + builder.c %{ + void sort_hits_c( + VALUE _ary, // Array of hits. + VALUE _ary_size // Size of array. + ) + { + hit *ary = (hit *) StringValuePtr(_ary); + unsigned int ary_size = FIX2UINT(_ary_size); + + qsort(ary, ary_size, sizeof(hit), hit_cmp_by_q_index); + } + } + + # Method that given a string (char array) encodes all kmers overlapping + # with a given step size as integers that are saved in an unsigned integer + # array. Then the array is sorted and uniq'ed and the number of unique + # elements is returned. + builder.c %{ + VALUE str_to_oligo_ary_c( + VALUE _str, // DNA or RNA string. + VALUE _str_size, // String length. + VALUE _ary, // Array to store result in. + VALUE _kmer, // Size of kmers/oligos. + VALUE _step // Step size for overlapping kmers. + ) + { + unsigned char *str = StringValuePtr(_str); + unsigned int str_size = FIX2UINT(_str_size); + unsigned int *ary = (unsigned int *) StringValuePtr(_ary); + unsigned int kmer = FIX2UINT(_kmer); + unsigned int step = FIX2UINT(_step); + + unsigned int ary_size = 0; + unsigned int mask = (1 << (2 * kmer)) - 1; + unsigned int bin = 0; + unsigned int i = 0; + + for (i = 0; i < kmer; i++) + { + bin <<= 2; + bin |= oligo_map[str[i]]; + } + + ary[ary_size++] = bin; + + for (i = kmer; i < str_size; i++) + { + bin <<= 2; + bin |= oligo_map[str[i]]; + + if ((i % step) == 0) { + ary[ary_size++] = (bin & mask); + } + } + + qsort(ary, ary_size, sizeof(unsigned int), uint_cmp); + ary_size = uniq_ary(ary, ary_size); + + return UINT2NUM(ary_size); + } + } + + # Method that given a string (char array) encodes all kmers overlapping + # with a given step size as integers that are pushed onto a Ruby array + # which is returned. + builder.c %{ + VALUE str_to_oligo_rb_ary_c( + VALUE _str, // DNA or RNA string. + VALUE _kmer, // Size of kmer or oligo. + VALUE _step // Step size for overlapping kmers. + ) + { + char *str = StringValuePtr(_str); + unsigned int kmer = FIX2UINT(_kmer); + unsigned int step = FIX2UINT(_step); + + VALUE array = rb_ary_new(); + long len = strlen(str); + unsigned int mask = (1 << (2 * kmer)) - 1; + unsigned int bin = 0; + unsigned int i = 0; + + for (i = 0; i < kmer; i++) + { + bin <<= 2; + bin |= oligo_map[str[i]]; + } + + rb_ary_push(array, INT2FIX(bin)); + + for (i = kmer; i < len; i++) + { + bin <<= 2; + bin |= oligo_map[str[i]]; + + if ((i % step) == 0) { + rb_ary_push(array, INT2FIX((bin & mask))); + } + } + + return array; + } + } + + # Method that counts all shared oligos/kmers between a subject sequence and + # all query sequences. For each oligo in the subject sequence (s_ary) the + # index of all query sequences containing this oligo is found the the q_ary + # where this information is stored sequentially in intervals. For the + # particula oligo the interval is looked up in the q_beg and q_end arrays. + # Shared oligos are recorded in the shared_ary. + builder.c %{ + void count_shared_c( + VALUE _q_ary, // Lookup array with q_index lists. + VALUE _q_begs_ary, // Lookup array with interval begins of q_index lists. + VALUE _q_ends_ary, // Lookup array with interval ends of q_index lists. + VALUE _s_ary, // Subject sequence oligo array. + VALUE _s_ary_size, // Subject sequence oligo array size. + VALUE _shared_ary // Array with shared oligo counts. + ) + { + unsigned int *q_ary = (unsigned int *) StringValuePtr(_q_ary); + unsigned int *q_begs_ary = (unsigned int *) StringValuePtr(_q_begs_ary); + unsigned int *q_ends_ary = (unsigned int *) StringValuePtr(_q_ends_ary); + unsigned int *s_ary = (unsigned int *) StringValuePtr(_s_ary); + unsigned int s_ary_size = FIX2UINT(_s_ary_size); + unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary); + + unsigned int oligo = 0; + unsigned int s = 0; + unsigned int q = 0; + unsigned int q_beg = 0; + unsigned int q_end = 0; + + for (s = 0; s < s_ary_size; s++) // each oligo do + { + oligo = s_ary[s]; + + q_beg = q_begs_ary[oligo]; + q_end = q_ends_ary[oligo]; + + for (q = q_beg; q < q_end; q++) + { + shared_ary[q_ary[q]]++; + } + } + } + } + + # Method to calculate the score for all query vs subject sequence comparisons. + # The score is calculated as the number of unique shared oligos divided by the + # smallest number of unique oligos in either the subject or query sequence. + # Only scores greater than or equal to a given minimum is stored in the hit + # array and the size of the hit array is returned. + builder.c %{ + VALUE calc_scores_c( + VALUE _q_total_ary, // Lookup array with total oligo counts. + VALUE _shared_ary, // Lookup arary with shared oligo counts. + VALUE _q_size, // Number of query sequences. + VALUE _s_count, // Number of unique oligos in subject sequence. + VALUE _hit_ary, // Result array. + VALUE _hit_ary_size, // Result array size. + VALUE _s_index, // Subject sequence index. + VALUE _min_score // Minimum score to include. + ) + { + unsigned int *q_total_ary = (unsigned int *) StringValuePtr(_q_total_ary); + unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary); + unsigned int q_size = FIX2UINT(_q_size); + unsigned int s_count = FIX2UINT(_s_count); + hit *hit_ary = (hit *) StringValuePtr(_hit_ary); + unsigned int hit_ary_size = FIX2UINT(_hit_ary_size); + unsigned int s_index = FIX2UINT(_s_index); + float min_score = (float) NUM2DBL(_min_score); + + unsigned int q_index = 0; + unsigned int q_count = 0; + unsigned int shared_count = 0; + unsigned int total_count = 0; + float score = 0; + hit new_score = {0, 0, 0}; + + for (q_index = 0; q_index < q_size; q_index++) + { + shared_count = shared_ary[q_index]; + q_count = q_total_ary[q_index]; + total_count = (s_count < q_count) ? s_count : q_count; + + score = shared_count / (float) total_count; + + if (score >= min_score) + { + new_score.q_index = q_index; + new_score.s_index = s_index; + new_score.score = score; + + hit_ary[hit_ary_size] = new_score; + + hit_ary_size++; + } + } + + return UINT2NUM(hit_ary_size); + } + } + + # Method that fetches all hits matching a given q_index value + # from the result_ary. Since the result_ary is sorted according + # to q_index we start at a given hit_index and fetch until + # the next hit with a different q_index. The fetched hits are + # stored in an array which is sorted according to decreasing + # score. The size of this array is returned. + builder.c %{ + VALUE get_hits_c( + VALUE _result_ary, // Result array + VALUE _result_ary_size, // Result array size + VALUE _hit_index, // Offset position in hit_ary to search from. + VALUE _hit_ary, // Array to store hits in. + VALUE _q_index) // q_index to fetch. + { + hit *result_ary = (hit *) StringValuePtr(_result_ary); + unsigned int result_ary_size = FIX2UINT(_result_ary_size); + unsigned int hit_index = FIX2UINT(_hit_index); + hit *hit_ary = (hit *) StringValuePtr(_hit_ary); + unsigned int q_index = FIX2UINT(_q_index); + hit new_hit = {0, 0, 0}; + unsigned int hit_ary_size = 0; + + while ((hit_index + hit_ary_size) < result_ary_size) + { + new_hit = result_ary[hit_index + hit_ary_size]; + + if (new_hit.q_index == q_index) + { + hit_ary[hit_ary_size] = new_hit; + + hit_ary_size++; + } + else + { + break; + } + } + + qsort(hit_ary, hit_ary_size, sizeof(hit), hit_cmp_by_score); + + return UINT2NUM(hit_ary_size); + } + } + end + + # >>>>>>>>>>>>>>> Embedded classes <<<<<<<<<<<<<<< + + # Class for holding score information. + class Score + attr_reader :score, :s_index + + # Method to initialize Score object with + # a subject sequence index and a score. + def initialize(score, s_index) + @s_index = s_index + @score = score + end + + # Method for outputting score objects. + def to_s + "#{@s_index}:#{@score.round(2)}" + end + end +end + +__END__