From 30bc11218fcbb63170e7d759c25b8714d4c95407 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 11 Sep 2012 09:58:37 +0000 Subject: [PATCH] added align/match.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1920 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/lib/maasha/align/match.rb | 153 ++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 code_ruby/lib/maasha/align/match.rb diff --git a/code_ruby/lib/maasha/align/match.rb b/code_ruby/lib/maasha/align/match.rb new file mode 100644 index 0000000..e4d5fe0 --- /dev/null +++ b/code_ruby/lib/maasha/align/match.rb @@ -0,0 +1,153 @@ +# Copyright (C) 2007-2012 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +class Matches + def self.find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) + m = self.new + m.matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) + end + + # Method that finds all maximally expanded non-redundant matches shared + # between two sequences inside a given search space. + def matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer) + matches = [] + redundant = Hash.new { |h, k| h[k] = [] } + + s_index = index_seq(s_seq, s_min, s_max, kmer) + + q_pos = q_min + + while q_pos <= q_max - kmer + 1 + q_oligo = q_seq[q_pos ... q_pos + kmer] + + s_index[q_oligo].each do |s_pos| + match = Match.new(q_pos, s_pos, kmer) + unless match_redundant?(redundant, match) + match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max) + matches << match + + match_redundant_add(redundant, match) + end + end + + q_pos += 1 + end + + matches + end + + # Method that indexes a sequence within a given interval such that the + # index contains all oligos of a given kmer size and the positions where + # this oligo was located. + def index_seq(seq, min, max, kmer) + index_hash = Hash.new { |h, k| h[k] = [] } + + pos = min + + while pos <= max - kmer + 1 + oligo = seq[pos ... pos + kmer] + index_hash[oligo] << pos + + pos += 1 + end + + index_hash + end + + # Method to check if a match is redundant. + def match_redundant?(redundant, match) + redundant[match.q_beg].each do |s_interval| + if s_interval.include? match.s_beg and s_interval.include? match.s_end + return true + end + end + + false + end + + # Method that adds a match to the redundancy index. + def match_redundant_add(redundant, match) + (match.q_beg .. match.q_end).each do |q| + redundant[q] << (match.s_beg .. match.s_end) + end + end + + # Method that expands a match as far as possible to the left and right. + def match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max) + match_expand_left(match, q_seq, s_seq, q_min, s_min, q_max, s_max) + match_expand_right(match, q_seq, s_seq, q_min, s_min, q_max, s_max) + + match + end + + # Method that expands a match as far as possible to the left. + def match_expand_left(match, q_seq, s_seq, q_min, s_min, q_max, s_max) + while match.q_beg > q_min and + match.s_beg > s_min and + q_seq[match.q_beg - 1] == s_seq[match.s_beg - 1] + match.q_beg -= 1 + match.s_beg -= 1 + match.length += 1 + end + + match + end + + # Method that expands a match as far as possible to the right. + def match_expand_right(match, q_seq, s_seq, q_min, s_min, q_max, s_max) + while match.q_end < q_max and + match.s_end < s_max and + q_seq[match.q_end + 1] == s_seq[match.s_end + 1] + match.length += 1 + end + + match + end + + # Class for containing a match between two sequences q and s. + class Match + attr_accessor :q_beg, :s_beg, :length, :score + + def initialize(q_beg, s_beg, length, score = 0.0) + @q_beg = q_beg + @s_beg = s_beg + @length = length + @score = score + end + + def q_end + @q_beg + @length - 1 + end + + def s_end + @s_beg + @length - 1 + end + + def to_s(seq = nil) + s = "q: #{@q_beg} #{q_end} s: #{@s_beg} #{s_end} l: #{@length} s: #{@score}" + s << " seq: #{seq[@q_beg .. q_end]}" if seq + s + end + end +end -- 2.39.5