code_ruby/lib/maasha/findsim.rb

   1 # Copyright (C) 2007-2012 Martin A. Hansen.
   2
   3 # This program is free software; you can redistribute it and/or
   4 # modify it under the terms of the GNU General Public License
   5 # as published by the Free Software Foundation; either version 2
   6 # of the License, or (at your option) any later version.
   7
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program; if not, write to the Free Software
  15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16
  17 # http://www.gnu.org/copyleft/gpl.html
  18
  19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  20
  21 # This software is part of the Biopieces framework (www.biopieces.org).
  22
  23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  24
  25 require 'inline'
  26 require 'maasha/fasta'
  27
  28 BYTES_IN_INT   = 4
  29 BYTES_IN_FLOAT = 4
  30 BYTES_IN_HIT   = 12
  31 RESULT_ARY_MAX = 50_000_000   # Maximum size for the result_ary.
  32 HIT_ARY_MAX    = 100_000      # Maximum size for the hit_ary.
  33
  34 # FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
  35 # The purpose is to find similarities between query DNA/RNA sequences and a
  36 # database of small subunit ribosomal DNA/RNA sequences. The sequence
  37 # similarities are calculated as the number of shared unique oligos/kmers
  38 # divided by the smallest number of unique oligoes in either the query or
  39 # database sequence. This yields a rough under estimate of similarity e.g. 50%
  40 # oligo similarity may correspond to 80% similarity on a nucleotide level
  41 # (needs clarification). The outcome of FindSim is a table with a row per
  42 # query sequence and the columns are the database hits sorted according to
  43 # similarity.
  44 #
  45 # Extensive use of inline C for speed.
  46 class FindSim
  47   include Enumerable
  48
  49   # Method to initialize FindSim Object.
  50   def initialize(opt_hash)
  51     @opt_hash     = opt_hash
  52     @q_size       = 0
  53     @q_ary        = nil
  54     @q_begs_ary   = nil
  55     @q_ends_ary   = nil
  56     @q_total_ary  = nil
  57     @result       = nil
  58     @result_count = 0
  59     @q_ids        = []
  60     @s_ids        = []
  61   end
  62
  63   # Method to load sequences from a query file in FASTA format
  64   # and index these for oligo based similarty search.
  65   def load_query(file)
  66     time       = Time.now
  67     q_total    = []
  68     oligo_hash = Hash.new { |h, k| h[k] = [] }
  69
  70     count = 0
  71
  72     Fasta.open(file, 'r') do |ios|
  73       ios.each do |entry|
  74         @q_ids << entry.seq_name if @opt_hash[:query_ids]
  75
  76         oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort
  77
  78         q_total << oligos.size
  79
  80         oligos.each do |oligo|
  81           oligo_hash[oligo] << count
  82         end
  83
  84         count += 1
  85       end
  86     end
  87
  88     @q_size = count
  89
  90     create_query_index(q_total, oligo_hash)
  91
  92     $stderr.puts "Loaded #{count} query sequences in #{(Time.now - time)} seconds." if @opt_hash[:verbose]
  93   end
  94
  95   # Method to search database or subject sequences from a FASTA file by
  96   # locating for each sequence all shared oligos with the query index.
  97   def search_db(file)
  98     time           = Time.now
  99     oligo_ary      = "\0" * (4 ** @opt_hash[:kmer]) * BYTES_IN_INT
 100     shared_ary     = "\0" * @q_size                 * BYTES_IN_INT
 101     result_ary     = "\0" * RESULT_ARY_MAX          * BYTES_IN_HIT
 102     result_count   = 0
 103
 104     Fasta.open(file, 'r') do |ios|
 105       ios.each_with_index do |entry, s_index|
 106         @s_ids << entry.seq_name if @opt_hash[:subject_ids]
 107
 108         zero_ary_c(oligo_ary,  (4 ** @opt_hash[:kmer]) * BYTES_IN_INT)
 109         zero_ary_c(shared_ary, @q_size                 * BYTES_IN_INT)
 110
 111         oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step])
 112
 113         count_shared_c(@q_ary, @q_begs_ary, @q_ends_ary, oligo_ary, oligo_ary_size, shared_ary)
 114
 115         result_count = calc_scores_c(@q_total_ary, shared_ary, @q_size, oligo_ary_size, result_ary, result_count, s_index, @opt_hash[:min_score])
 116
 117         if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
 118           $stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
 119         end
 120       end
 121     end
 122
 123     @result       = result_ary[0 ... result_count * BYTES_IN_HIT]
 124     @result_count = result_count
 125   end
 126
 127   # Method that for each query index yields all hits, sorted according to
 128   # decending score, as Hit objects.
 129   def each
 130     sort_hits_c(@result, @result_count)
 131
 132     hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
 133
 134     hit_index = 0
 135
 136     (0 ... @q_size).each do |q_index|
 137       zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
 138       hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
 139
 140       if @opt_hash[:max_hits]
 141         max = (hit_ary_size > @opt_hash[:max_hits]) ? @opt_hash[:max_hits] : hit_ary_size
 142       else
 143         max = hit_ary_size
 144       end
 145
 146       (0 ... max).each do |i|
 147         q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
 148
 149         q_id = @opt_hash[:query_ids]   ? @q_ids[q_index] : q_index
 150         s_id = @opt_hash[:subject_ids] ? @s_ids[s_index] : s_index
 151
 152         yield Hit.new(q_id, s_id, score)
 153       end
 154
 155       hit_index += hit_ary_size
 156     end
 157
 158     self
 159   end
 160
 161   private
 162
 163   # Method to create the query index for FindSim search. The index consists of
 164   # four lookup arrays which are stored as instance variables:
 165   # *  @q_total_ary holds per index the total of unique oligos for that
 166   #    particular query sequences.
 167   # *  @q_ary holds sequencially lists of query indexes so it is possible
 168   #    to lookup the list for a particular oligo and get all query indeces for
 169   #    query sequences containing this oligo.
 170   # *  @q_begs_ary holds per index the begin coordinate for the @q_ary list for
 171   #    a particular oligo.
 172   # *  @q_ends_ary holds per index the end coordinate for the @q_ary list for a
 173   #    particular oligo.
 174   def create_query_index(q_total, oligo_hash)
 175     @q_total_ary = q_total.pack("I*")
 176
 177     @q_ary = ""
 178
 179     beg        = 0
 180     oligo_begs = Array.new(4 ** @opt_hash[:kmer], 0)
 181     oligo_ends = Array.new(4 ** @opt_hash[:kmer], 0)
 182
 183     oligo_hash.each do |oligo, list|
 184       @q_ary << list.pack("I*")
 185       oligo_begs[oligo] = beg
 186       oligo_ends[oligo] = beg + list.size
 187
 188       beg += list.size
 189     end
 190
 191     @q_begs_ary = oligo_begs.pack("I*")
 192     @q_ends_ary = oligo_ends.pack("I*")
 193   end
 194
 195   # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
 196
 197   inline do |builder|
 198     # Bitmap lookup table where the index is ASCII values
 199     # and the following nucleotides are encoded as:
 200     # T or t = 1
 201     # U or u = 1
 202     # C or c = 2
 203     # G or g = 3
 204     builder.prefix %{
 205       unsigned char oligo_map[256] = {
 206         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 207         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 208         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 209         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 210         0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
 211         0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 212         0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
 213         0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 214         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 215         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 216         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 217         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 218         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 219         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 220         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 221         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 222       };
 223     }
 224
 225     # Defining a hit struct to hold hits consisting of query
 226     # sequence index, subject sequence index and score.
 227     builder.prefix %{
 228       typedef struct
 229       {
 230         unsigned int q_index;
 231         unsigned int s_index;
 232         float          score;
 233       } hit;
 234     }
 235
 236     # Qsort unsigned int comparison function.
 237     # Returns negative if b > a and positive if a > b.
 238     builder.prefix %{
 239       int uint_cmp(const void *a, const void *b)
 240       {
 241         const unsigned int *ia = (const unsigned int *) a;
 242         const unsigned int *ib = (const unsigned int *) b;
 243
 244         return *ia - *ib;
 245       }
 246     }
 247
 248     # Qsort hit struct comparision function of q_index.
 249     # Returns negative if b > a and positive if a > b.
 250     builder.prefix %{
 251       int hit_cmp_by_q_index(const void *a, const void *b)
 252       {
 253         hit *ia = (hit *) a;
 254         hit *ib = (hit *) b;
 255
 256         return (int) (ia->q_index - ib->q_index);
 257       }
 258     }
 259
 260     # Qsort hit struct comparision function of score.
 261     # Returns negative if b < a and positive if a < b.
 262     # We multiply with 1000 to maintain 2 decimal floats.
 263     builder.prefix %{
 264       int hit_cmp_by_score(const void *a, const void *b)
 265       {
 266         hit *ia = (hit *) a;
 267         hit *ib = (hit *) b;
 268
 269         return (int) (1000 * ib->score - 1000 * ia->score);
 270       }
 271     }
 272
 273     # Given a sorted unsigned integer removes all duplicate values
 274     # and move the unique values to the front of the array. Returns
 275     # the new size of the array.
 276     builder.prefix %{
 277       unsigned int uniq_ary(unsigned int *ary, unsigned int ary_size)
 278       {
 279         unsigned int new_ary_size = 0;
 280         unsigned int i            = 0;
 281         unsigned int j            = 0;
 282
 283         for (i = 1; i < ary_size; i++)
 284         {
 285           if (ary[i] != ary[j])
 286           {
 287             j++;
 288             ary[j] = ary[i]; // Move it to the front
 289           }
 290         }
 291
 292         new_ary_size = j + 1;
 293
 294         return new_ary_size;
 295       }
 296     }
 297
 298     # Method that given a byte array and its size in bytes
 299     # sets all bytes to 0.
 300     builder.c %{
 301       void zero_ary_c(
 302         VALUE _ary,       // Byte array to zero.
 303         VALUE _ary_size   // Size of array.
 304       )
 305       {
 306         char         *ary      = (char *) StringValuePtr(_ary);
 307         unsigned int  ary_size = FIX2UINT(_ary_size);
 308
 309         bzero(ary, ary_size);
 310       }
 311     }
 312
 313     # Method to sort array of hits according to q_index.
 314     builder.c %{
 315       void sort_hits_c(
 316         VALUE _ary,       // Array of hits.
 317         VALUE _ary_size   // Size of array.
 318       )
 319       {
 320         hit          *ary      = (hit *) StringValuePtr(_ary);
 321         unsigned int  ary_size = FIX2UINT(_ary_size);
 322
 323         qsort(ary, ary_size, sizeof(hit), hit_cmp_by_q_index);
 324       }
 325     }
 326
 327     # Method that given a string (char array) encodes all kmers overlapping
 328     # with a given step size as integers that are saved in an unsigned integer
 329     # array. Then the array is sorted and uniq'ed and the number of unique
 330     # elements is returned.
 331     builder.c %{
 332       VALUE str_to_oligo_ary_c(
 333         VALUE _str,        // DNA or RNA string.
 334         VALUE _str_size,   // String length.
 335         VALUE _ary,        // Array to store result in.
 336         VALUE _kmer,       // Size of kmers/oligos.
 337         VALUE _step        // Step size for overlapping kmers.
 338       )
 339       {
 340         unsigned char *str      = StringValuePtr(_str);
 341         unsigned int   str_size = FIX2UINT(_str_size);
 342         unsigned int  *ary      = (unsigned int *) StringValuePtr(_ary);
 343         unsigned int   kmer     = FIX2UINT(_kmer);
 344         unsigned int   step     = FIX2UINT(_step);
 345
 346         unsigned int ary_size = 0;
 347         unsigned int mask     = (1 << (2 * kmer)) - 1;
 348         unsigned int bin      = 0;
 349         unsigned int i        = 0;
 350
 351         for (i = 0; i < kmer; i++)
 352         {
 353           bin <<= 2;
 354           bin |= oligo_map[str[i]];
 355         }
 356
 357         ary[ary_size++] = bin;
 358
 359         for (i = kmer; i < str_size; i++)
 360         {
 361           bin <<= 2;
 362           bin |= oligo_map[str[i]];
 363
 364           if ((i % step) == 0) {
 365             ary[ary_size++] = (bin & mask);
 366           }
 367         }
 368
 369         qsort(ary, ary_size, sizeof(unsigned int), uint_cmp);
 370         ary_size = uniq_ary(ary, ary_size);
 371
 372         return UINT2NUM(ary_size);
 373       }
 374     }
 375
 376     # Method that given a string (char array) encodes all kmers overlapping
 377     # with a given step size as integers that are pushed onto a Ruby array
 378     # which is returned.
 379     # TODO should have an option for skipping oligos with ambiguity codes.
 380     builder.c %{
 381       VALUE str_to_oligo_rb_ary_c(
 382         VALUE _str,    // DNA or RNA string.
 383         VALUE _kmer,   // Size of kmer or oligo.
 384         VALUE _step    // Step size for overlapping kmers.
 385       )
 386       {
 387         char         *str   = StringValuePtr(_str);
 388         unsigned int  kmer  = FIX2UINT(_kmer);
 389         unsigned int  step  = FIX2UINT(_step);
 390
 391         VALUE         array = rb_ary_new();
 392         long          len   = strlen(str);
 393         unsigned int  mask  = (1 << (2 * kmer)) - 1;
 394         unsigned int  bin   = 0;
 395         unsigned int  i     = 0;
 396
 397         for (i = 0; i < kmer; i++)
 398         {
 399           bin <<= 2;
 400           bin |= oligo_map[str[i]];
 401         }
 402
 403         rb_ary_push(array, INT2FIX(bin));
 404
 405         for (i = kmer; i < len; i++)
 406         {
 407           bin <<= 2;
 408           bin |= oligo_map[str[i]];
 409
 410           if ((i % step) == 0) {
 411             rb_ary_push(array, INT2FIX((bin & mask)));
 412           }
 413         }
 414
 415         return array;
 416       }
 417     }
 418
 419     # Method that counts all shared oligos/kmers between a subject sequence and
 420     # all query sequences. For each oligo in the subject sequence (s_ary) the
 421     # index of all query sequences containing this oligo is found the the q_ary
 422     # where this information is stored sequentially in intervals. For the
 423     # particula oligo the interval is looked up in the q_beg and q_end arrays.
 424     # Shared oligos are recorded in the shared_ary.
 425     builder.c %{
 426       void count_shared_c(
 427         VALUE _q_ary,        // Lookup array with q_index lists.
 428         VALUE _q_begs_ary,   // Lookup array with interval begins of q_index lists.
 429         VALUE _q_ends_ary,   // Lookup array with interval ends of q_index lists.
 430         VALUE _s_ary,        // Subject sequence oligo array.
 431         VALUE _s_ary_size,   // Subject sequence oligo array size.
 432         VALUE _shared_ary    // Array with shared oligo counts.
 433       )
 434       {
 435         unsigned int *q_ary      = (unsigned int *) StringValuePtr(_q_ary);
 436         unsigned int *q_begs_ary = (unsigned int *) StringValuePtr(_q_begs_ary);
 437         unsigned int *q_ends_ary = (unsigned int *) StringValuePtr(_q_ends_ary);
 438         unsigned int *s_ary      = (unsigned int *) StringValuePtr(_s_ary);
 439         unsigned int  s_ary_size = FIX2UINT(_s_ary_size);
 440         unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
 441
 442         unsigned int oligo = 0;
 443         unsigned int s     = 0;
 444         unsigned int q     = 0;
 445         unsigned int q_beg = 0;
 446         unsigned int q_end = 0;
 447
 448         for (s = 0; s < s_ary_size; s++)  // each oligo do
 449         {
 450           oligo = s_ary[s];
 451
 452           q_beg = q_begs_ary[oligo];
 453           q_end = q_ends_ary[oligo];
 454
 455           for (q = q_beg; q < q_end; q++)
 456           {
 457             shared_ary[q_ary[q]]++;
 458           }
 459         }
 460       }
 461     }
 462
 463     # Method to calculate the score for all query vs subject sequence comparisons.
 464     # The score is calculated as the number of unique shared oligos divided by the
 465     # smallest number of unique oligos in either the subject or query sequence.
 466     # Only scores greater than or equal to a given minimum is stored in the hit
 467     # array and the size of the hit array is returned.
 468     builder.c %{
 469       VALUE calc_scores_c(
 470         VALUE _q_total_ary,    // Lookup array with total oligo counts.
 471         VALUE _shared_ary,     // Lookup arary with shared oligo counts.
 472         VALUE _q_size,         // Number of query sequences.
 473         VALUE _s_count,        // Number of unique oligos in subject sequence.
 474         VALUE _hit_ary,        // Result array.
 475         VALUE _hit_ary_size,   // Result array size.
 476         VALUE _s_index,        // Subject sequence index.
 477         VALUE _min_score       // Minimum score to include.
 478       )
 479       {
 480         unsigned int *q_total_ary  = (unsigned int *) StringValuePtr(_q_total_ary);
 481         unsigned int *shared_ary   = (unsigned int *) StringValuePtr(_shared_ary);
 482         unsigned int  q_size       = FIX2UINT(_q_size);
 483         unsigned int  s_count      = FIX2UINT(_s_count);
 484         hit          *hit_ary      = (hit *) StringValuePtr(_hit_ary);
 485         unsigned int  hit_ary_size = FIX2UINT(_hit_ary_size);
 486         unsigned int  s_index      = FIX2UINT(_s_index);
 487         float         min_score    = (float) NUM2DBL(_min_score);
 488
 489         unsigned int  q_index      = 0;
 490         unsigned int  q_count      = 0;
 491         unsigned int  shared_count = 0;
 492         unsigned int  total_count  = 0;
 493         float         score        = 0;
 494         hit           new_score    = {0, 0, 0};
 495
 496         for (q_index = 0; q_index < q_size; q_index++)
 497         {
 498           shared_count = shared_ary[q_index];
 499           q_count      = q_total_ary[q_index];
 500           total_count  = (s_count < q_count) ? s_count : q_count;
 501
 502           score = shared_count / (float) total_count;
 503
 504           if (score >= min_score)
 505           {
 506             new_score.q_index = q_index;
 507             new_score.s_index = s_index;
 508             new_score.score   = score;
 509
 510             hit_ary[hit_ary_size] = new_score;
 511
 512             hit_ary_size++;
 513           }
 514         }
 515
 516         return UINT2NUM(hit_ary_size);
 517       }
 518     }
 519
 520     # Method that fetches all hits matching a given q_index value
 521     # from the result_ary. Since the result_ary is sorted according
 522     # to q_index we start at a given hit_index and fetch until
 523     # the next hit with a different q_index. The fetched hits are
 524     # stored in an array which is sorted according to decreasing
 525     # score. The size of this array is returned.
 526     builder.c %{
 527       VALUE get_hits_c(
 528         VALUE _result_ary,        // Result array
 529         VALUE _result_ary_size,   // Result array size
 530         VALUE _hit_index,         // Offset position in hit_ary to search from.
 531         VALUE _hit_ary,           // Array to store hits in.
 532         VALUE _q_index)           // q_index to fetch.
 533       {
 534         hit          *result_ary      = (hit *) StringValuePtr(_result_ary);
 535         unsigned int  result_ary_size = FIX2UINT(_result_ary_size);
 536         unsigned int  hit_index       = FIX2UINT(_hit_index);
 537         hit          *hit_ary         = (hit *) StringValuePtr(_hit_ary);
 538         unsigned int  q_index         = FIX2UINT(_q_index);
 539         hit           new_hit         = {0, 0, 0};
 540         unsigned int  hit_ary_size    = 0;
 541
 542         while ((hit_index + hit_ary_size) < result_ary_size)
 543         {
 544           new_hit = result_ary[hit_index + hit_ary_size];
 545
 546           if (new_hit.q_index == q_index)
 547           {
 548             hit_ary[hit_ary_size] = new_hit;
 549
 550             hit_ary_size++;
 551           }
 552           else
 553           {
 554             break;
 555           }
 556         }
 557
 558         qsort(hit_ary, hit_ary_size, sizeof(hit), hit_cmp_by_score);
 559
 560         return UINT2NUM(hit_ary_size);
 561       }
 562     }
 563   end
 564
 565   # >>>>>>>>>>>>>>> Embedded classes <<<<<<<<<<<<<<<
 566
 567   # Class for holding score information.
 568   class Hit
 569     attr_reader :q_id, :s_id, :score
 570
 571     # Method to initialize Hit object with
 572     # query and subject id a score.
 573     def initialize(q_id, s_id, score)
 574       @q_id  = q_id
 575       @s_id  = s_id
 576       @score = score
 577     end
 578
 579     # Method for outputting score objects.
 580     def to_s
 581       "#{@q_id}:#{@s_id}:#{@score.round(2)}"
 582     end
 583
 584     # Method to convert to a Biopiece record.
 585     def to_bp
 586       hash = {}
 587       hash[:REC_TYPE] = "findsim"
 588       hash[:Q_ID]     = @q_id
 589       hash[:S_ID]     = @s_id
 590       hash[:SCORE]    = @score.round(2)
 591
 592       hash
 593     end
 594   end
 595 end
 596
 597 __END__