X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Falign%2Fmatch.rb;h=784a2d12f2a1916adb8a216c821769683697a19a;hb=e95fec9ae319e4da65e7dd5700abf7b4a036b411;hp=721bf137f6e90bd726a5e8e52f4f49ac9cc39c98;hpb=268f4b42e12087be9134b35887ffe6138c3ef3df;p=biopieces.git diff --git a/code_ruby/lib/maasha/align/match.rb b/code_ruby/lib/maasha/align/match.rb index 721bf13..784a2d1 100644 --- a/code_ruby/lib/maasha/align/match.rb +++ b/code_ruby/lib/maasha/align/match.rb @@ -1,4 +1,4 @@ -# Copyright (C) 2007-2012 Martin A. Hansen. +# Copyright (C) 2007-2013 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -22,70 +22,47 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -require 'profile' +require 'inline' +require 'maasha/math_aux' -# Class containing methods for located all non-redundant Maximally Extended Matches (MEMs) -# between two strings. -class Matches - # Class method to located all non-redudant MEMs bewteen two sequences within a given - # search space and seeded with kmers of a given size. - def self.find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) - m = self.new - m.matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) - end - - # Method that finds all maximally expanded non-redundant matches shared - # between two sequences inside a given search space. - def matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) - matches = [] - redundant = Hash.new { |h, k| h[k] = [] } - - s_index = index_seq(s_seq, s_min, s_max, kmer) - - q_pos = q_min - - while q_pos <= q_max - kmer + 1 - q_oligo = q_seq[q_pos ... q_pos + kmer] +# Class for describing a match between two sequences q and s. +class Match + attr_accessor :q_beg, :s_beg, :length, :score - s_index[q_oligo].each do |s_pos| - match = Match.new(q_pos, s_pos, kmer) - - unless match_redundant?(redundant, match) - match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - matches << match - - match_redundant_add(redundant, match) - end - end - - q_pos += 1 - end - - matches + def initialize(q_beg, s_beg, length) + @q_beg = q_beg + @s_beg = s_beg + @length = length + @score = 0.0 end - # Method that indexes a sequence within a given interval such that the - # index contains all oligos of a given kmer size and the positions where - # this oligo was located. - def index_seq(seq, min, max, kmer, step = 1) - index_hash = Hash.new { |h, k| h[k] = [] } + def q_end + @q_beg + @length - 1 + end - pos = min + def s_end + @s_beg + @length - 1 + end - while pos <= max - kmer + 1 - oligo = seq[pos ... pos + kmer] - index_hash[oligo] << pos + # Method to expand a match as far as possible to the left and right + # within a given search space. + def expand(q_seq, s_seq, q_space_beg, s_space_beg, q_space_end, s_space_end) + length = expand_left_C(q_seq, s_seq, @q_beg, @s_beg, q_space_beg, s_space_beg) + @length += length + @q_beg -= length + @s_beg -= length - pos += step - end + length = expand_right_C(q_seq, s_seq, q_end, s_end, q_space_end, s_space_end) + @length += length - index_hash + self end - # Method to check if a match is redundant. - def match_redundant?(redundant, match) - redundant[match.q_beg].each do |s_interval| - if s_interval.include? match.s_beg and s_interval.include? match.s_end # TODO test if include? is slow + def redundant?(matches) + # Speed-up with binary search + + matches.each do |m| + if Math.dist_point2line(self.q_beg, self.s_beg, m.q_beg, m.s_beg, m.q_end, m.s_end) == 0 return true end end @@ -93,68 +70,76 @@ class Matches false end - # Method that adds a match to the redundancy index. - def match_redundant_add(redundant, match) - (match.q_beg .. match.q_end).each do |q| - redundant[q] << (match.s_beg .. match.s_end) - end - end - - # Method that expands a match as far as possible to the left and right. - def match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - match_expand_left(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - match_expand_right(match, q_seq, s_seq, q_min, s_min, q_max, s_max) + def to_s(seq = nil) + s = "q: #{@q_beg}, s: #{@s_beg}, l: #{@length}, s: #{@score}" + s << " #{seq[@q_beg .. q_end]}" if seq - match + s end - # Method that expands a match as far as possible to the left. - def match_expand_left(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - while match.q_beg > q_min and - match.s_beg > s_min and - q_seq[match.q_beg - 1] == s_seq[match.s_beg - 1] - match.q_beg -= 1 - match.s_beg -= 1 - match.length += 1 - end - - match - end - - # Method that expands a match as far as possible to the right. - def match_expand_right(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - while match.q_end < q_max and - match.s_end < s_max and - q_seq[match.q_end + 1] == s_seq[match.s_end + 1] - match.length += 1 - end - - match - end - - # Class for containing a match between two sequences q and s. - class Match - attr_accessor :q_beg, :s_beg, :length, :score - - def initialize(q_beg, s_beg, length, score = 0.0) - @q_beg = q_beg - @s_beg = s_beg - @length = length - @score = score - end - - def q_end - @q_beg + @length - 1 - end - - def s_end - @s_beg + @length - 1 - end - - def to_s(seq = nil) - s = "q: #{@q_beg} #{q_end} s: #{@s_beg} #{s_end} l: #{@length} s: #{@score}" - s << " seq: #{seq[@q_beg .. q_end]}" if seq - s - end + private + + inline do |builder| + # Method to expand a match as far as possible to the left within a given + # search space. + builder.c %{ + VALUE expand_left_C( + VALUE _q_seq, + VALUE _s_seq, + VALUE _q_beg, + VALUE _s_beg, + VALUE _q_space_beg, + VALUE _s_space_beg + ) + { + unsigned char *q_seq = (unsigned char *) StringValuePtr(_q_seq); + unsigned char *s_seq = (unsigned char *) StringValuePtr(_s_seq); + unsigned int q_beg = NUM2UINT(_q_beg); + unsigned int s_beg = NUM2UINT(_s_beg); + unsigned int q_space_beg = NUM2UINT(_q_space_beg); + unsigned int s_space_beg = NUM2UINT(_s_space_beg); + + unsigned int len = 0; + + while (q_beg > q_space_beg && s_beg > s_space_beg && q_seq[q_beg - 1] == s_seq[s_beg - 1]) + { + q_beg--; + s_beg--; + len++; + } + + return UINT2NUM(len); + } + } + + builder.c %{ + VALUE expand_right_C( + VALUE _q_seq, + VALUE _s_seq, + VALUE _q_end, + VALUE _s_end, + VALUE _q_space_end, + VALUE _s_space_end + ) + { + unsigned char *q_seq = (unsigned char *) StringValuePtr(_q_seq); + unsigned char *s_seq = (unsigned char *) StringValuePtr(_s_seq); + unsigned int q_end = NUM2UINT(_q_end); + unsigned int s_end = NUM2UINT(_s_end); + unsigned int q_space_end = NUM2UINT(_q_space_end); + unsigned int s_space_end = NUM2UINT(_s_space_end); + + unsigned int len = 0; + + while (q_end + 1 <= q_space_end && s_end + 1 <= s_space_end && q_seq[q_end + 1] == s_seq[s_end + 1]) + { + q_end++; + s_end++; + len++; + } + + return UINT2NUM(len); + } + } end end