-# Copyright (C) 2007-2012 Martin A. Hansen.
+# Copyright (C) 2007-2013 Martin A. Hansen.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-require 'profile'
+require 'inline'
+require 'maasha/math_aux'
-# Class containing methods for located all non-redundant Maximally Extended Matches (MEMs)
-# between two strings.
-class Matches
- # Class method to located all non-redudant MEMs bewteen two sequences within a given
- # search space and seeded with kmers of a given size.
- def self.find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer)
- m = self.new
- m.matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer)
- end
-
- # Method that finds all maximally expanded non-redundant matches shared
- # between two sequences inside a given search space.
- def matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer)
- matches = []
- redundant = Hash.new { |h, k| h[k] = [] }
-
- s_index = index_seq(s_seq, s_min, s_max, kmer)
-
- q_pos = q_min
-
- while q_pos <= q_max - kmer + 1
- q_oligo = q_seq[q_pos ... q_pos + kmer]
+# Class for describing a match between two sequences q and s.
+class Match
+ attr_accessor :q_beg, :s_beg, :length, :score
- s_index[q_oligo].each do |s_pos|
- match = Match.new(q_pos, s_pos, kmer)
-
- unless match_redundant?(redundant, match)
- match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
- matches << match
-
- match_redundant_add(redundant, match)
- end
- end
-
- q_pos += 1
- end
-
- matches
+ def initialize(q_beg, s_beg, length)
+ @q_beg = q_beg
+ @s_beg = s_beg
+ @length = length
+ @score = 0.0
end
- # Method that indexes a sequence within a given interval such that the
- # index contains all oligos of a given kmer size and the positions where
- # this oligo was located.
- def index_seq(seq, min, max, kmer, step = 1)
- index_hash = Hash.new { |h, k| h[k] = [] }
+ def q_end
+ @q_beg + @length - 1
+ end
- pos = min
+ def s_end
+ @s_beg + @length - 1
+ end
- while pos <= max - kmer + 1
- oligo = seq[pos ... pos + kmer]
- index_hash[oligo] << pos
+ # Method to expand a match as far as possible to the left and right
+ # within a given search space.
+ def expand(q_seq, s_seq, q_space_beg, s_space_beg, q_space_end, s_space_end)
+ length = expand_left_C(q_seq, s_seq, @q_beg, @s_beg, q_space_beg, s_space_beg)
+ @length += length
+ @q_beg -= length
+ @s_beg -= length
- pos += step
- end
+ length = expand_right_C(q_seq, s_seq, q_end, s_end, q_space_end, s_space_end)
+ @length += length
- index_hash
+ self
end
- # Method to check if a match is redundant.
- def match_redundant?(redundant, match)
- redundant[match.q_beg].each do |s_interval|
- if s_interval.include? match.s_beg and s_interval.include? match.s_end # TODO test if include? is slow
+ def redundant?(matches)
+ # Speed-up with binary search
+
+ matches.each do |m|
+ if Math.dist_point2line(self.q_beg, self.s_beg, m.q_beg, m.s_beg, m.q_end, m.s_end) == 0
return true
end
end
false
end
- # Method that adds a match to the redundancy index.
- def match_redundant_add(redundant, match)
- (match.q_beg .. match.q_end).each do |q|
- redundant[q] << (match.s_beg .. match.s_end)
- end
- end
-
- # Method that expands a match as far as possible to the left and right.
- def match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
- match_expand_left(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
- match_expand_right(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
+ def to_s(seq = nil)
+ s = "q: #{@q_beg}, s: #{@s_beg}, l: #{@length}, s: #{@score}"
+ s << " #{seq[@q_beg .. q_end]}" if seq
- match
+ s
end
- # Method that expands a match as far as possible to the left.
- def match_expand_left(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
- while match.q_beg > q_min and
- match.s_beg > s_min and
- q_seq[match.q_beg - 1] == s_seq[match.s_beg - 1]
- match.q_beg -= 1
- match.s_beg -= 1
- match.length += 1
- end
-
- match
- end
-
- # Method that expands a match as far as possible to the right.
- def match_expand_right(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
- while match.q_end < q_max and
- match.s_end < s_max and
- q_seq[match.q_end + 1] == s_seq[match.s_end + 1]
- match.length += 1
- end
-
- match
- end
-
- # Class for containing a match between two sequences q and s.
- class Match
- attr_accessor :q_beg, :s_beg, :length, :score
-
- def initialize(q_beg, s_beg, length, score = 0.0)
- @q_beg = q_beg
- @s_beg = s_beg
- @length = length
- @score = score
- end
-
- def q_end
- @q_beg + @length - 1
- end
-
- def s_end
- @s_beg + @length - 1
- end
-
- def to_s(seq = nil)
- s = "q: #{@q_beg} #{q_end} s: #{@s_beg} #{s_end} l: #{@length} s: #{@score}"
- s << " seq: #{seq[@q_beg .. q_end]}" if seq
- s
- end
+ private
+
+ inline do |builder|
+ # Method to expand a match as far as possible to the left within a given
+ # search space.
+ builder.c %{
+ VALUE expand_left_C(
+ VALUE _q_seq,
+ VALUE _s_seq,
+ VALUE _q_beg,
+ VALUE _s_beg,
+ VALUE _q_space_beg,
+ VALUE _s_space_beg
+ )
+ {
+ unsigned char *q_seq = (unsigned char *) StringValuePtr(_q_seq);
+ unsigned char *s_seq = (unsigned char *) StringValuePtr(_s_seq);
+ unsigned int q_beg = NUM2UINT(_q_beg);
+ unsigned int s_beg = NUM2UINT(_s_beg);
+ unsigned int q_space_beg = NUM2UINT(_q_space_beg);
+ unsigned int s_space_beg = NUM2UINT(_s_space_beg);
+
+ unsigned int len = 0;
+
+ while (q_beg > q_space_beg && s_beg > s_space_beg && q_seq[q_beg - 1] == s_seq[s_beg - 1])
+ {
+ q_beg--;
+ s_beg--;
+ len++;
+ }
+
+ return UINT2NUM(len);
+ }
+ }
+
+ builder.c %{
+ VALUE expand_right_C(
+ VALUE _q_seq,
+ VALUE _s_seq,
+ VALUE _q_end,
+ VALUE _s_end,
+ VALUE _q_space_end,
+ VALUE _s_space_end
+ )
+ {
+ unsigned char *q_seq = (unsigned char *) StringValuePtr(_q_seq);
+ unsigned char *s_seq = (unsigned char *) StringValuePtr(_s_seq);
+ unsigned int q_end = NUM2UINT(_q_end);
+ unsigned int s_end = NUM2UINT(_s_end);
+ unsigned int q_space_end = NUM2UINT(_q_space_end);
+ unsigned int s_space_end = NUM2UINT(_s_space_end);
+
+ unsigned int len = 0;
+
+ while (q_end + 1 <= q_space_end && s_end + 1 <= s_space_end && q_seq[q_end + 1] == s_seq[s_end + 1])
+ {
+ q_end++;
+ s_end++;
+ len++;
+ }
+
+ return UINT2NUM(len);
+ }
+ }
end
end