X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Falign%2Fmatch.rb;h=784a2d12f2a1916adb8a216c821769683697a19a;hb=e95fec9ae319e4da65e7dd5700abf7b4a036b411;hp=e4d5fe06627c5346b5fbc6a982ab4b98461e6bf1;hpb=30bc11218fcbb63170e7d759c25b8714d4c95407;p=biopieces.git diff --git a/code_ruby/lib/maasha/align/match.rb b/code_ruby/lib/maasha/align/match.rb index e4d5fe0..784a2d1 100644 --- a/code_ruby/lib/maasha/align/match.rb +++ b/code_ruby/lib/maasha/align/match.rb @@ -1,4 +1,4 @@ -# Copyright (C) 2007-2012 Martin A. Hansen. +# Copyright (C) 2007-2013 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -22,63 +22,47 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -class Matches - def self.find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) - m = self.new - m.matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) - end - - # Method that finds all maximally expanded non-redundant matches shared - # between two sequences inside a given search space. - def matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) - matches = [] - redundant = Hash.new { |h, k| h[k] = [] } - - s_index = index_seq(s_seq, s_min, s_max, kmer) - - q_pos = q_min - - while q_pos <= q_max - kmer + 1 - q_oligo = q_seq[q_pos ... q_pos + kmer] - - s_index[q_oligo].each do |s_pos| - match = Match.new(q_pos, s_pos, kmer) - unless match_redundant?(redundant, match) - match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - matches << match - - match_redundant_add(redundant, match) - end - end +require 'inline' +require 'maasha/math_aux' - q_pos += 1 - end +# Class for describing a match between two sequences q and s. +class Match + attr_accessor :q_beg, :s_beg, :length, :score - matches + def initialize(q_beg, s_beg, length) + @q_beg = q_beg + @s_beg = s_beg + @length = length + @score = 0.0 end - # Method that indexes a sequence within a given interval such that the - # index contains all oligos of a given kmer size and the positions where - # this oligo was located. - def index_seq(seq, min, max, kmer) - index_hash = Hash.new { |h, k| h[k] = [] } + def q_end + @q_beg + @length - 1 + end - pos = min + def s_end + @s_beg + @length - 1 + end - while pos <= max - kmer + 1 - oligo = seq[pos ... pos + kmer] - index_hash[oligo] << pos + # Method to expand a match as far as possible to the left and right + # within a given search space. + def expand(q_seq, s_seq, q_space_beg, s_space_beg, q_space_end, s_space_end) + length = expand_left_C(q_seq, s_seq, @q_beg, @s_beg, q_space_beg, s_space_beg) + @length += length + @q_beg -= length + @s_beg -= length - pos += 1 - end + length = expand_right_C(q_seq, s_seq, q_end, s_end, q_space_end, s_space_end) + @length += length - index_hash + self end - # Method to check if a match is redundant. - def match_redundant?(redundant, match) - redundant[match.q_beg].each do |s_interval| - if s_interval.include? match.s_beg and s_interval.include? match.s_end + def redundant?(matches) + # Speed-up with binary search + + matches.each do |m| + if Math.dist_point2line(self.q_beg, self.s_beg, m.q_beg, m.s_beg, m.q_end, m.s_end) == 0 return true end end @@ -86,68 +70,76 @@ class Matches false end - # Method that adds a match to the redundancy index. - def match_redundant_add(redundant, match) - (match.q_beg .. match.q_end).each do |q| - redundant[q] << (match.s_beg .. match.s_end) - end - end - - # Method that expands a match as far as possible to the left and right. - def match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - match_expand_left(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - match_expand_right(match, q_seq, s_seq, q_min, s_min, q_max, s_max) + def to_s(seq = nil) + s = "q: #{@q_beg}, s: #{@s_beg}, l: #{@length}, s: #{@score}" + s << " #{seq[@q_beg .. q_end]}" if seq - match + s end - # Method that expands a match as far as possible to the left. - def match_expand_left(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - while match.q_beg > q_min and - match.s_beg > s_min and - q_seq[match.q_beg - 1] == s_seq[match.s_beg - 1] - match.q_beg -= 1 - match.s_beg -= 1 - match.length += 1 - end - - match - end - - # Method that expands a match as far as possible to the right. - def match_expand_right(match, q_seq, s_seq, q_min, s_min, q_max, s_max) - while match.q_end < q_max and - match.s_end < s_max and - q_seq[match.q_end + 1] == s_seq[match.s_end + 1] - match.length += 1 - end - - match - end - - # Class for containing a match between two sequences q and s. - class Match - attr_accessor :q_beg, :s_beg, :length, :score - - def initialize(q_beg, s_beg, length, score = 0.0) - @q_beg = q_beg - @s_beg = s_beg - @length = length - @score = score - end - - def q_end - @q_beg + @length - 1 - end - - def s_end - @s_beg + @length - 1 - end - - def to_s(seq = nil) - s = "q: #{@q_beg} #{q_end} s: #{@s_beg} #{s_end} l: #{@length} s: #{@score}" - s << " seq: #{seq[@q_beg .. q_end]}" if seq - s - end + private + + inline do |builder| + # Method to expand a match as far as possible to the left within a given + # search space. + builder.c %{ + VALUE expand_left_C( + VALUE _q_seq, + VALUE _s_seq, + VALUE _q_beg, + VALUE _s_beg, + VALUE _q_space_beg, + VALUE _s_space_beg + ) + { + unsigned char *q_seq = (unsigned char *) StringValuePtr(_q_seq); + unsigned char *s_seq = (unsigned char *) StringValuePtr(_s_seq); + unsigned int q_beg = NUM2UINT(_q_beg); + unsigned int s_beg = NUM2UINT(_s_beg); + unsigned int q_space_beg = NUM2UINT(_q_space_beg); + unsigned int s_space_beg = NUM2UINT(_s_space_beg); + + unsigned int len = 0; + + while (q_beg > q_space_beg && s_beg > s_space_beg && q_seq[q_beg - 1] == s_seq[s_beg - 1]) + { + q_beg--; + s_beg--; + len++; + } + + return UINT2NUM(len); + } + } + + builder.c %{ + VALUE expand_right_C( + VALUE _q_seq, + VALUE _s_seq, + VALUE _q_end, + VALUE _s_end, + VALUE _q_space_end, + VALUE _s_space_end + ) + { + unsigned char *q_seq = (unsigned char *) StringValuePtr(_q_seq); + unsigned char *s_seq = (unsigned char *) StringValuePtr(_s_seq); + unsigned int q_end = NUM2UINT(_q_end); + unsigned int s_end = NUM2UINT(_s_end); + unsigned int q_space_end = NUM2UINT(_q_space_end); + unsigned int s_space_end = NUM2UINT(_s_space_end); + + unsigned int len = 0; + + while (q_end + 1 <= q_space_end && s_end + 1 <= s_space_end && q_seq[q_end + 1] == s_seq[s_end + 1]) + { + q_end++; + s_end++; + len++; + } + + return UINT2NUM(len); + } + } end end