# Reconstructed from git patch 1d54595c1543846e648795c6fbea809e6806057c
# (author: martinahansen, Tue, 10 Jul 2012 13:37:10 +0000,
# subject: "added align/pair.rb to ruby trunk",
# git-svn-id: http://biopieces.googlecode.com/svn/trunk@1864 74ccb610-7750-0410-82ae-013aeee3265d),
# which created code_ruby/lib/maasha/align/pair.rb.

# Copyright (C) 2007-2012 Martin A. Hansen.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# http://www.gnu.org/copyleft/gpl.html

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# This software is part of the Biopieces framework (www.biopieces.org).

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Extending Math module with a couple of useful methods.
module Math
  # Method for calculating the distance between a point (px, py) and the
  # line running through (x1, y1) and (x2, y2). Returns a Float.
  #
  # A vertical line (x1 == x2) has no finite slope, so the slope/intercept
  # formula below would yield NaN; that case is handled separately. If both
  # defining points coincide, the distance to that single point is returned.
  def self.dist_point2line(px, py, x1, y1, x2, y2)
    if x1 == x2
      return dist_point2point(px, py, x1, y1).to_f if y1 == y2

      return (px - x1).abs.to_f
    end

    a = (y2.to_f - y1) / (x2.to_f - x1)
    b = y1 - a * x1

    (a * px + b - py).abs / Math.sqrt(a**2 + 1)
  end

  # Method for calculating the distance between two points.
  def self.dist_point2point(x1, y1, x2, y2)
    Math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
  end
end

# Module with stuff to create a pairwise alignment.
module PairAlign
  # Class for creating a pairwise alignment.
  class AlignPair
    # Word size used for the first round of match finding; it is halved
    # every time a search round has been run (see align_recurse).
    INITIAL_KMER = 16

    # Class method to create a pairwise alignment of two given Seq objects.
    def self.align(q_entry, s_entry)
      new(q_entry, s_entry)
    end

    # Method to initialize a pairwise alignment given two Seq objects.
    #
    # Both entries must respond to +seq+ (a mutable String) and +length+.
    # The sequences are modified in place: they are downcased, matching
    # regions are upcased, and "-" gap characters are inserted.
    def initialize(q_entry, s_entry)
      @q_entry = q_entry
      @s_entry = s_entry
      @matches = []

      @q_entry.seq.downcase!
      @s_entry.seq.downcase!

      align_recurse(@q_entry.seq, @s_entry.seq, 0, 0,
                    @q_entry.length - 1, @s_entry.length - 1, INITIAL_KMER, [])
      matches_upcase
      gaps_insert
    end

    private

    # Method that creates an alignment by chaining matches, which are
    # subsequences shared between two sequences. This recursive method
    # functions by considering only matches within a given search space. If no
    # matches are given these will be located and matches will be included
    # depending on a calculated score. New search spaces spanning the spaces
    # between the best scoring matches and the search space boundaries will be
    # cast and recursed into.
    def align_recurse(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer, matches)
      matches = matches_select_by_space(matches, q_min, s_min, q_max, s_max)

      # Search with progressively shorter words until something is found.
      # Note that kmer is halved after every search round, including the
      # successful one, so sub-spaces start with the smaller word size.
      while matches.empty? && kmer > 0
        matches = matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer)
        kmer /= 2
      end

      matches = matches_select_by_score(matches, q_min, s_min, q_max, s_max)

      # Matches are sorted by ascending score, so the best one is last.
      best_match = matches.pop
      return unless best_match

      @matches << best_match

      # Search space to the left of (before) the best match.
      l_q_max = best_match.q_beg - 1
      l_s_max = best_match.s_beg - 1

      # Search space to the right of (after) the best match.
      r_q_min = best_match.q_end + 1
      r_s_min = best_match.s_end + 1

      if l_q_max - q_min > 0 && l_s_max - s_min > 0
        align_recurse(q_seq, s_seq, q_min, s_min, l_q_max, l_s_max, kmer, matches)
      end

      if q_max - r_q_min > 0 && s_max - r_s_min > 0
        align_recurse(q_seq, s_seq, r_q_min, r_s_min, q_max, s_max, kmer, matches)
      end
    end

    # Method to select matches that lie within the search space.
    def matches_select_by_space(matches, q_min, s_min, q_max, s_max)
      matches.select do |match|
        match.q_beg >= q_min &&
          match.s_beg >= s_min &&
          match.q_end <= q_max &&
          match.s_end <= s_max
      end
    end

    # Method to select the best scoring matches and return these sorted
    # according to score (ascending). Each match gets its score attribute
    # updated; matches with a score of zero or less are discarded.
    def matches_select_by_score(matches, q_min, s_min, q_max, s_max)
      new_matches = matches.select do |match|
        score_length = match_score_by_length(match)
        score_diag   = match_score_by_diagonal_dist(match, q_min, s_min, q_max, s_max)

        match.score = score_length - score_diag
        match.score > 0
      end

      new_matches.sort_by! { |match| match.score }

      new_matches
    end

    # Method to calculate score based on match length.
    def match_score_by_length(match)
      match.length.to_f
    end

    # Method to calculate score based on the distance to the closest
    # diagonal. The smaller the distance the better the score.
    #
    # Two diagonals are considered: one anchored at the beginning corner of
    # the search space and one anchored at the end corner. Both span the
    # narrow dimension of the search space.
    def match_score_by_diagonal_dist(match, q_min, s_min, q_max, s_max)
      q_dim = q_max - q_min + 1
      s_dim = s_max - s_min + 1
      dim   = [q_dim, s_dim].min # the narrow end of the search space

      beg_dist = Math.dist_point2line(match.q_beg, match.s_beg,
                                      q_min, s_min, q_min + dim, s_min + dim)
      end_dist = Math.dist_point2line(match.q_beg, match.s_beg,
                                      q_max - dim, s_max - dim, q_max, s_max)

      [beg_dist, end_dist].min.to_f
    end

    # Method that finds all maximally expanded non-redundant matches shared
    # between two sequences inside a given search space.
    def matches_find(q_seq, s_seq, q_min, s_min, q_max, s_max, kmer)
      matches   = []
      redundant = Hash.new { |h, k| h[k] = [] }
      s_index   = index_seq(s_seq, s_min, s_max, kmer)

      # The range is empty when the search space is shorter than kmer.
      (q_min..q_max - kmer + 1).each do |q_pos|
        q_oligo = q_seq[q_pos...q_pos + kmer]

        s_index[q_oligo].each do |s_pos|
          match = Match.new(q_pos, s_pos, kmer)

          next if match_redundant?(redundant, match)

          match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
          matches << match

          match_redundant_add(redundant, match)
        end
      end

      matches
    end

    # Method that indexes a sequence within a given interval such that the
    # index contains all oligos of a given kmer size and the positions where
    # this oligo was located.
    def index_seq(seq, min, max, kmer)
      index_hash = Hash.new { |h, k| h[k] = [] }

      (min..max - kmer + 1).each do |pos|
        index_hash[seq[pos...pos + kmer]] << pos
      end

      index_hash
    end

    # Method to check if a match is redundant, i.e. if its subject interval
    # is contained in the subject interval of a previously registered match
    # covering the same query start position.
    #
    # NOTE(review): this test does not compare diagonals, so a seed lying on
    # a different diagonal but inside the registered intervals is also
    # treated as redundant - confirm that this is intended.
    def match_redundant?(redundant, match)
      redundant[match.q_beg].any? do |s_interval|
        s_interval.include?(match.s_beg) && s_interval.include?(match.s_end)
      end
    end

    # Method that adds a match to the redundancy index: the subject interval
    # of the match is registered for every query position the match covers.
    def match_redundant_add(redundant, match)
      (match.q_beg..match.q_end).each do |q|
        redundant[q] << (match.s_beg..match.s_end)
      end
    end

    # Method that expands a match as far as possible to the left and right.
    def match_expand(match, q_seq, s_seq, q_min, s_min, q_max, s_max)
      match_expand_left(match, q_seq, s_seq, q_min, s_min)
      match_expand_right(match, q_seq, s_seq, q_max, s_max)

      match
    end

    # Method that expands a match as far as possible to the left.
    def match_expand_left(match, q_seq, s_seq, q_min, s_min)
      while match.q_beg > q_min &&
            match.s_beg > s_min &&
            q_seq[match.q_beg - 1] == s_seq[match.s_beg - 1]
        match.q_beg  -= 1
        match.s_beg  -= 1
        match.length += 1
      end

      match
    end

    # Method that expands a match as far as possible to the right.
    def match_expand_right(match, q_seq, s_seq, q_max, s_max)
      # q_end and s_end are derived from length, so growing length is enough.
      while match.q_end < q_max &&
            match.s_end < s_max &&
            q_seq[match.q_end + 1] == s_seq[match.s_end + 1]
        match.length += 1
      end

      match
    end

    # Method for debugging purposes that upcases matching sequence while
    # non-matching sequence is kept in lower case.
    def matches_upcase
      @matches.each do |match|
        q_range = match.q_beg..match.q_end
        s_range = match.s_beg..match.s_end

        @q_entry.seq[q_range] = @q_entry.seq[q_range].upcase
        @s_entry.seq[s_range] = @s_entry.seq[s_range].upcase
      end
    end

    # Method that inserts gaps in sequences based on a list of matches and
    # thus creating an alignment.
    # TODO: check boundaries!
    def gaps_insert
      @matches.sort_by! { |m| m.q_beg }

      q_gaps = 0
      s_gaps = 0

      @matches.each do |match|
        # Offset between the current (gapped) start positions of the match.
        diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg)

        if diff < 0
          @q_entry.seq.insert(match.q_beg + q_gaps, "-" * diff.abs)
          q_gaps += diff.abs
        elsif diff > 0
          @s_entry.seq.insert(match.s_beg + s_gaps, "-" * diff.abs)
          s_gaps += diff.abs
        end
      end

      # Pad the shorter sequence so both end up with the same length.
      diff = @q_entry.length - @s_entry.length

      if diff < 0
        @q_entry.seq << ("-" * diff.abs)
      else
        @s_entry.seq << ("-" * diff.abs)
      end
    end
  end

  # Class for containing a match between two sequences q and s.
  class Match
    attr_accessor :q_beg, :s_beg, :length, :score

    # q_beg and s_beg are the start positions of the match in q and s,
    # length is the number of matching positions.
    def initialize(q_beg, s_beg, length, score = 0.0)
      @q_beg  = q_beg
      @s_beg  = s_beg
      @length = length
      @score  = score
    end

    # Last position of the match in q (inclusive).
    def q_end
      @q_beg + @length - 1
    end

    # Last position of the match in s (inclusive).
    def s_end
      @s_beg + @length - 1
    end

    # String representation of the match; if a sequence is given, the
    # matching part of it (in q coordinates) is appended.
    def to_s(seq = nil)
      s = "q: #{@q_beg} #{q_end} s: #{@s_beg} #{s_end} l: #{@length} s: #{@score}"
      s << " seq: #{seq[@q_beg..q_end]}" if seq
      s
    end
  end
end