code_ruby/lib/maasha/findsim.rb

   1 # Copyright (C) 2007-2012 Martin A. Hansen.
   2
   3 # This program is free software; you can redistribute it and/or
   4 # modify it under the terms of the GNU General Public License
   5 # as published by the Free Software Foundation; either version 2
   6 # of the License, or (at your option) any later version.
   7
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program; if not, write to the Free Software
  15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16
  17 # http://www.gnu.org/copyleft/gpl.html
  18
  19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  20
  21 # This software is part of the Biopieces framework (www.biopieces.org).
  22
  23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  24
  25 require 'inline'
  26 require 'maasha/fasta'
  27 require 'maasha/align'
  28
  29 BYTES_IN_INT   = 4
  30 BYTES_IN_FLOAT = 4
  31 BYTES_IN_HIT   = 2 * BYTES_IN_INT + 1 * BYTES_IN_FLOAT   # i.e. 12
  32 NUC_ALPH_SIZE  = 4            # Alphabet size of nucleotides.
  33 RESULT_ARY_MAX = 50_000_000   # Maximum size for the result_ary.
  34 HIT_ARY_MAX    = 100_000      # Maximum size for the hit_ary.
  35
  36 # FindSim is an implementation of the SimRank logic proposed by Niels Larsen.
  37 # The purpose is to find similarities between query DNA/RNA sequences and a
  38 # database of small subunit ribosomal DNA/RNA sequences. The sequence
  39 # similarities are calculated as the number of shared unique oligos/kmers
  40 # divided by the smallest number of unique oligoes in either the query or
  41 # database sequence. This yields a rough under estimate of similarity e.g. 50%
  42 # oligo similarity may correspond to 80% similarity on a nucleotide level
  43 # (needs clarification). The outcome of FindSim is a table with a row per
  44 # query sequence and the columns are the database hits sorted according to
  45 # similarity.
  46 #
  47 # Extensive use of inline C for speed.
  48 class FindSim
  49   include Enumerable
  50
  51   # Method to initialize FindSim Object.
  52   def initialize(opt_hash)
  53     @opt_hash     = opt_hash
  54     @q_size       = 0
  55     @q_ary        = nil
  56     @q_begs_ary   = nil
  57     @q_ends_ary   = nil
  58     @q_total_ary  = nil
  59     @result       = nil
  60     @result_count = 0
  61     @q_ids        = []
  62     @s_ids        = []
  63     @q_entries    = []
  64     @s_entries    = []
  65   end
  66
  67   # Method to load sequences from a query file in FASTA format
  68   # and index these for oligo based similarty search.
  69   def load_query(file)
  70     time       = Time.now
  71     q_total    = []
  72     oligo_hash = Hash.new { |h, k| h[k] = [] }
  73
  74     count = 0
  75
  76     Fasta.open(file, 'r') do |ios|
  77       ios.each do |entry|
  78         @q_ids     << entry.seq_name if @opt_hash[:query_ids]
  79         @q_entries << entry          if @opt_hash[:realign]
  80
  81         oligos = str_to_oligo_rb_ary_c(entry.seq, @opt_hash[:kmer], 1).uniq.sort
  82
  83         q_total << oligos.size
  84
  85         oligos.each do |oligo|
  86           oligo_hash[oligo] << count
  87         end
  88
  89         count += 1
  90       end
  91     end
  92
  93     @q_size = count
  94
  95     create_query_index(q_total, oligo_hash)
  96
  97     $stderr.puts "Loaded #{count} query sequences in #{(Time.now - time)} seconds." if @opt_hash[:verbose]
  98   end
  99
 100   # Method to search database or subject sequences from a FASTA file by
 101   # locating for each sequence all shared oligos with the query index.
 102   def search_db(file)
 103     time           = Time.now
 104     oligo_ary      = "\0" * (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT
 105     shared_ary     = "\0" * @q_size                             * BYTES_IN_INT
 106     result_ary     = "\0" * RESULT_ARY_MAX                      * BYTES_IN_HIT
 107     result_count   = 0
 108
 109     Fasta.open(file, 'r') do |ios|
 110       ios.each_with_index do |entry, s_index|
 111         @s_ids     << entry.seq_name if @opt_hash[:subject_ids]
 112         @s_entries << entry          if @opt_hash[:realign]
 113
 114         zero_ary_c(oligo_ary,  (NUC_ALPH_SIZE ** @opt_hash[:kmer]) * BYTES_IN_INT)
 115         zero_ary_c(shared_ary, @q_size                             * BYTES_IN_INT)
 116
 117         oligo_ary_size = str_to_oligo_ary_c(entry.seq, entry.len, oligo_ary, @opt_hash[:kmer], @opt_hash[:step])
 118
 119         count_shared_c(@q_ary, @q_begs_ary, @q_ends_ary, oligo_ary, oligo_ary_size, shared_ary)
 120
 121         result_count = calc_scores_c(@q_total_ary, shared_ary, @q_size, oligo_ary_size, result_ary, result_count, s_index, @opt_hash[:min_score])
 122
 123         if ((s_index + 1) % 1000) == 0 and @opt_hash[:verbose]
 124           $stderr.puts "Searched #{s_index + 1} sequences in #{Time.now - time} seconds (#{result_count} hits)."
 125         end
 126       end
 127     end
 128
 129     @result       = result_ary[0 ... result_count * BYTES_IN_HIT]
 130     @result_count = result_count
 131   end
 132
 133   # Method that for each query index yields all hits, sorted according to
 134   # decending score, as Hit objects.
 135   def each
 136     sort_hits_c(@result, @result_count)
 137
 138     hit_ary = "\0" * HIT_ARY_MAX * BYTES_IN_HIT
 139
 140     hit_index = 0
 141
 142     (0 ... @q_size).each do |q_index|
 143       zero_ary_c(hit_ary, HIT_ARY_MAX * BYTES_IN_HIT)
 144       hit_ary_size = get_hits_c(@result, @result_count, hit_index, hit_ary, q_index)
 145
 146       if @opt_hash[:max_hits]
 147         max = (hit_ary_size > @opt_hash[:max_hits]) ? @opt_hash[:max_hits] : hit_ary_size
 148       else
 149         max = hit_ary_size
 150       end
 151
 152       best_score = 0
 153
 154       (0 ... max).each do |i|
 155         q_index, s_index, score = hit_ary[BYTES_IN_HIT * i ... BYTES_IN_HIT * i + BYTES_IN_HIT].unpack("IIF")
 156
 157         q_id = @opt_hash[:query_ids]   ? @q_ids[q_index] : q_index
 158         s_id = @opt_hash[:subject_ids] ? @s_ids[s_index] : s_index
 159
 160         if @opt_hash[:realign]
 161           new_score = Align.muscle([@q_entries[q_index], @s_entries[s_index]]).identity
 162           score     = new_score if new_score > score
 163         end
 164
 165         if @opt_hash[:max_diversity]
 166           best_score = score if i == 0
 167
 168           break if best_score - score > (@opt_hash[:max_diversity] / 100)
 169         end
 170
 171         yield Hit.new(q_id, s_id, score)
 172       end
 173
 174       hit_index += hit_ary_size
 175     end
 176
 177     self
 178   end
 179
 180   private
 181
 182   # Method to create the query index for FindSim search. The index consists of
 183   # four lookup arrays which are stored as instance variables:
 184   # *  @q_total_ary holds per index the total of unique oligos for that
 185   #    particular query sequences.
 186   # *  @q_ary holds sequencially lists of query indexes so it is possible
 187   #    to lookup the list for a particular oligo and get all query indeces for
 188   #    query sequences containing this oligo.
 189   # *  @q_begs_ary holds per index the begin coordinate for the @q_ary list for
 190   #    a particular oligo.
 191   # *  @q_ends_ary holds per index the end coordinate for the @q_ary list for a
 192   #    particular oligo.
 193   def create_query_index(q_total, oligo_hash)
 194     @q_total_ary = q_total.pack("I*")
 195
 196     @q_ary = ""
 197
 198     beg        = 0
 199     oligo_begs = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
 200     oligo_ends = Array.new(NUC_ALPH_SIZE ** @opt_hash[:kmer], 0)
 201
 202     oligo_hash.each do |oligo, list|
 203       @q_ary << list.pack("I*")
 204       oligo_begs[oligo] = beg
 205       oligo_ends[oligo] = beg + list.size
 206
 207       beg += list.size
 208     end
 209
 210     @q_begs_ary = oligo_begs.pack("I*")
 211     @q_ends_ary = oligo_ends.pack("I*")
 212   end
 213
 214   # >>>>>>>>>>>>>>> RubyInline C code <<<<<<<<<<<<<<<
 215
 216   inline do |builder|
 217     # Bitmap lookup table where the index is ASCII values
 218     # and the following nucleotides are encoded as:
 219     # T or t = 1
 220     # U or u = 1
 221     # C or c = 2
 222     # G or g = 3
 223     builder.prefix %{
 224       unsigned char oligo_map[256] = {
 225         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 226         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 227         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 228         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 229         0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
 230         0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 231         0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
 232         0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 233         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 234         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 235         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 236         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 237         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 238         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 239         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 240         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 241       };
 242     }
 243
 244     # Defining a hit struct to hold hits consisting of query
 245     # sequence index, subject sequence index and score.
 246     builder.prefix %{
 247       typedef struct
 248       {
 249         unsigned int q_index;
 250         unsigned int s_index;
 251         float          score;
 252       } hit;
 253     }
 254
 255     # Qsort unsigned int comparison function.
 256     # Returns negative if b > a and positive if a > b.
 257     builder.prefix %{
 258       int uint_cmp(const void *a, const void *b)
 259       {
 260         const unsigned int *ia = (const unsigned int *) a;
 261         const unsigned int *ib = (const unsigned int *) b;
 262
 263         return *ia - *ib;
 264       }
 265     }
 266
 267     # Qsort hit struct comparision function of q_index.
 268     # Returns negative if b > a and positive if a > b.
 269     builder.prefix %{
 270       int hit_cmp_by_q_index(const void *a, const void *b)
 271       {
 272         hit *ia = (hit *) a;
 273         hit *ib = (hit *) b;
 274
 275         return (int) (ia->q_index - ib->q_index);
 276       }
 277     }
 278
 279     # Qsort hit struct comparision function of score.
 280     # Returns negative if b < a and positive if a < b.
 281     # We multiply with 1000 to maintain 2 decimal floats.
 282     builder.prefix %{
 283       int hit_cmp_by_score(const void *a, const void *b)
 284       {
 285         hit *ia = (hit *) a;
 286         hit *ib = (hit *) b;
 287
 288         return (int) (1000 * ib->score - 1000 * ia->score);
 289       }
 290     }
 291
 292     # Given a sorted unsigned integer removes all duplicate values
 293     # and move the unique values to the front of the array. Returns
 294     # the new size of the array.
 295     builder.prefix %{
 296       unsigned int uniq_ary(unsigned int *ary, unsigned int ary_size)
 297       {
 298         unsigned int new_ary_size = 0;
 299         unsigned int i            = 0;
 300         unsigned int j            = 0;
 301
 302         for (i = 1; i < ary_size; i++)
 303         {
 304           if (ary[i] != ary[j])
 305           {
 306             j++;
 307             ary[j] = ary[i]; // Move it to the front
 308           }
 309         }
 310
 311         new_ary_size = j + 1;
 312
 313         return new_ary_size;
 314       }
 315     }
 316
 317     # Method that given a byte array and its size in bytes
 318     # sets all bytes to 0.
 319     builder.c %{
 320       void zero_ary_c(
 321         VALUE _ary,       // Byte array to zero.
 322         VALUE _ary_size   // Size of array.
 323       )
 324       {
 325         char         *ary      = (char *) StringValuePtr(_ary);
 326         unsigned int  ary_size = FIX2UINT(_ary_size);
 327
 328         bzero(ary, ary_size);
 329       }
 330     }
 331
 332     # Method to sort array of hits according to q_index.
 333     builder.c %{
 334       void sort_hits_c(
 335         VALUE _ary,       // Array of hits.
 336         VALUE _ary_size   // Size of array.
 337       )
 338       {
 339         hit          *ary      = (hit *) StringValuePtr(_ary);
 340         unsigned int  ary_size = FIX2UINT(_ary_size);
 341
 342         qsort(ary, ary_size, sizeof(hit), hit_cmp_by_q_index);
 343       }
 344     }
 345
 346     # Method that given a string (char array) encodes all kmers overlapping
 347     # with a given step size as integers that are saved in an unsigned integer
 348     # array. Then the array is sorted and uniq'ed and the number of unique
 349     # elements is returned.
 350     builder.c %{
 351       VALUE str_to_oligo_ary_c(
 352         VALUE _str,        // DNA or RNA string.
 353         VALUE _str_size,   // String length.
 354         VALUE _ary,        // Array to store result in.
 355         VALUE _kmer,       // Size of kmers/oligos.
 356         VALUE _step        // Step size for overlapping kmers.
 357       )
 358       {
 359         unsigned char *str      = StringValuePtr(_str);
 360         unsigned int   str_size = FIX2UINT(_str_size);
 361         unsigned int  *ary      = (unsigned int *) StringValuePtr(_ary);
 362         unsigned int   kmer     = FIX2UINT(_kmer);
 363         unsigned int   step     = FIX2UINT(_step);
 364
 365         unsigned int ary_size = 0;
 366         unsigned int mask     = (1 << (2 * kmer)) - 1;
 367         unsigned int bin      = 0;
 368         unsigned int i        = 0;
 369
 370         for (i = 0; i < kmer; i++)
 371         {
 372           bin <<= 2;
 373           bin |= oligo_map[str[i]];
 374         }
 375
 376         ary[ary_size++] = bin;
 377
 378         for (i = kmer; i < str_size; i++)
 379         {
 380           bin <<= 2;
 381           bin |= oligo_map[str[i]];
 382
 383           if ((i % step) == 0) {
 384             ary[ary_size++] = (bin & mask);
 385           }
 386         }
 387
 388         qsort(ary, ary_size, sizeof(unsigned int), uint_cmp);
 389         ary_size = uniq_ary(ary, ary_size);
 390
 391         return UINT2NUM(ary_size);
 392       }
 393     }
 394
 395     # Method that given a string (char array) encodes all kmers overlapping
 396     # with a given step size as integers that are pushed onto a Ruby array
 397     # which is returned.
 398     # TODO should have an option for skipping oligos with ambiguity codes.
 399     builder.c %{
 400       VALUE str_to_oligo_rb_ary_c(
 401         VALUE _str,    // DNA or RNA string.
 402         VALUE _kmer,   // Size of kmer or oligo.
 403         VALUE _step    // Step size for overlapping kmers.
 404       )
 405       {
 406         char         *str   = StringValuePtr(_str);
 407         unsigned int  kmer  = FIX2UINT(_kmer);
 408         unsigned int  step  = FIX2UINT(_step);
 409
 410         VALUE         array = rb_ary_new();
 411         long          len   = strlen(str);
 412         unsigned int  mask  = (1 << (2 * kmer)) - 1;
 413         unsigned int  bin   = 0;
 414         unsigned int  i     = 0;
 415
 416         for (i = 0; i < kmer; i++)
 417         {
 418           bin <<= 2;
 419           bin |= oligo_map[str[i]];
 420         }
 421
 422         rb_ary_push(array, INT2FIX(bin));
 423
 424         for (i = kmer; i < len; i++)
 425         {
 426           bin <<= 2;
 427           bin |= oligo_map[str[i]];
 428
 429           if ((i % step) == 0) {
 430             rb_ary_push(array, INT2FIX((bin & mask)));
 431           }
 432         }
 433
 434         return array;
 435       }
 436     }
 437
 438     # Method that counts all shared oligos/kmers between a subject sequence and
 439     # all query sequences. For each oligo in the subject sequence (s_ary) the
 440     # index of all query sequences containing this oligo is found the the q_ary
 441     # where this information is stored sequentially in intervals. For the
 442     # particula oligo the interval is looked up in the q_beg and q_end arrays.
 443     # Shared oligos are recorded in the shared_ary.
 444     builder.c %{
 445       void count_shared_c(
 446         VALUE _q_ary,        // Lookup array with q_index lists.
 447         VALUE _q_begs_ary,   // Lookup array with interval begins of q_index lists.
 448         VALUE _q_ends_ary,   // Lookup array with interval ends of q_index lists.
 449         VALUE _s_ary,        // Subject sequence oligo array.
 450         VALUE _s_ary_size,   // Subject sequence oligo array size.
 451         VALUE _shared_ary    // Array with shared oligo counts.
 452       )
 453       {
 454         unsigned int *q_ary      = (unsigned int *) StringValuePtr(_q_ary);
 455         unsigned int *q_begs_ary = (unsigned int *) StringValuePtr(_q_begs_ary);
 456         unsigned int *q_ends_ary = (unsigned int *) StringValuePtr(_q_ends_ary);
 457         unsigned int *s_ary      = (unsigned int *) StringValuePtr(_s_ary);
 458         unsigned int  s_ary_size = FIX2UINT(_s_ary_size);
 459         unsigned int *shared_ary = (unsigned int *) StringValuePtr(_shared_ary);
 460
 461         unsigned int oligo = 0;
 462         unsigned int s     = 0;
 463         unsigned int q     = 0;
 464         unsigned int q_beg = 0;
 465         unsigned int q_end = 0;
 466
 467         for (s = 0; s < s_ary_size; s++)  // each oligo do
 468         {
 469           oligo = s_ary[s];
 470
 471           q_beg = q_begs_ary[oligo];
 472           q_end = q_ends_ary[oligo];
 473
 474           for (q = q_beg; q < q_end; q++)
 475           {
 476             shared_ary[q_ary[q]]++;
 477           }
 478         }
 479       }
 480     }
 481
 482     # Method to calculate the score for all query vs subject sequence comparisons.
 483     # The score is calculated as the number of unique shared oligos divided by the
 484     # smallest number of unique oligos in either the subject or query sequence.
 485     # Only scores greater than or equal to a given minimum is stored in the hit
 486     # array and the size of the hit array is returned.
 487     builder.c %{
 488       VALUE calc_scores_c(
 489         VALUE _q_total_ary,    // Lookup array with total oligo counts.
 490         VALUE _shared_ary,     // Lookup arary with shared oligo counts.
 491         VALUE _q_size,         // Number of query sequences.
 492         VALUE _s_count,        // Number of unique oligos in subject sequence.
 493         VALUE _hit_ary,        // Result array.
 494         VALUE _hit_ary_size,   // Result array size.
 495         VALUE _s_index,        // Subject sequence index.
 496         VALUE _min_score       // Minimum score to include.
 497       )
 498       {
 499         unsigned int *q_total_ary  = (unsigned int *) StringValuePtr(_q_total_ary);
 500         unsigned int *shared_ary   = (unsigned int *) StringValuePtr(_shared_ary);
 501         unsigned int  q_size       = FIX2UINT(_q_size);
 502         unsigned int  s_count      = FIX2UINT(_s_count);
 503         hit          *hit_ary      = (hit *) StringValuePtr(_hit_ary);
 504         unsigned int  hit_ary_size = FIX2UINT(_hit_ary_size);
 505         unsigned int  s_index      = FIX2UINT(_s_index);
 506         float         min_score    = (float) NUM2DBL(_min_score);
 507
 508         unsigned int  q_index      = 0;
 509         unsigned int  q_count      = 0;
 510         unsigned int  shared_count = 0;
 511         unsigned int  total_count  = 0;
 512         float         score        = 0;
 513         hit           new_score    = {0, 0, 0};
 514
 515         for (q_index = 0; q_index < q_size; q_index++)
 516         {
 517           shared_count = shared_ary[q_index];
 518           q_count      = q_total_ary[q_index];
 519           total_count  = (s_count < q_count) ? s_count : q_count;
 520
 521           score = shared_count / (float) total_count;
 522
 523           if (score >= min_score)
 524           {
 525             new_score.q_index = q_index;
 526             new_score.s_index = s_index;
 527             new_score.score   = score;
 528
 529             hit_ary[hit_ary_size] = new_score;
 530
 531             hit_ary_size++;
 532           }
 533         }
 534
 535         return UINT2NUM(hit_ary_size);
 536       }
 537     }
 538
 539     # Method that fetches all hits matching a given q_index value
 540     # from the result_ary. Since the result_ary is sorted according
 541     # to q_index we start at a given hit_index and fetch until
 542     # the next hit with a different q_index. The fetched hits are
 543     # stored in an array which is sorted according to decreasing
 544     # score. The size of this array is returned.
 545     builder.c %{
 546       VALUE get_hits_c(
 547         VALUE _result_ary,        // Result array
 548         VALUE _result_ary_size,   // Result array size
 549         VALUE _hit_index,         // Offset position in hit_ary to search from.
 550         VALUE _hit_ary,           // Array to store hits in.
 551         VALUE _q_index)           // q_index to fetch.
 552       {
 553         hit          *result_ary      = (hit *) StringValuePtr(_result_ary);
 554         unsigned int  result_ary_size = FIX2UINT(_result_ary_size);
 555         unsigned int  hit_index       = FIX2UINT(_hit_index);
 556         hit          *hit_ary         = (hit *) StringValuePtr(_hit_ary);
 557         unsigned int  q_index         = FIX2UINT(_q_index);
 558         hit           new_hit         = {0, 0, 0};
 559         unsigned int  hit_ary_size    = 0;
 560
 561         while ((hit_index + hit_ary_size) < result_ary_size)
 562         {
 563           new_hit = result_ary[hit_index + hit_ary_size];
 564
 565           if (new_hit.q_index == q_index)
 566           {
 567             hit_ary[hit_ary_size] = new_hit;
 568
 569             hit_ary_size++;
 570           }
 571           else
 572           {
 573             break;
 574           }
 575         }
 576
 577         qsort(hit_ary, hit_ary_size, sizeof(hit), hit_cmp_by_score);
 578
 579         return UINT2NUM(hit_ary_size);
 580       }
 581     }
 582   end
 583
 584   # >>>>>>>>>>>>>>> Embedded classes <<<<<<<<<<<<<<<
 585
 586   # Class for holding score information.
 587   class Hit
 588     attr_reader :q_id, :s_id, :score
 589
 590     # Method to initialize Hit object with
 591     # query and subject id a score.
 592     def initialize(q_id, s_id, score)
 593       @q_id  = q_id
 594       @s_id  = s_id
 595       @score = score
 596     end
 597
 598     # Method for outputting score objects.
 599     def to_s
 600       "#{@q_id}:#{@s_id}:#{@score.round(2)}"
 601     end
 602
 603     # Method to convert to a Biopiece record.
 604     def to_bp
 605       hash = {}
 606       hash[:REC_TYPE] = "findsim"
 607       hash[:Q_ID]     = @q_id
 608       hash[:S_ID]     = @s_id
 609       hash[:SCORE]    = @score.round(2)
 610
 611       hash
 612     end
 613   end
 614 end
 615
 616 __END__