From e3d39adeb168cd8ac24e43920778457029723401 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Mon, 28 Mar 2011 14:46:16 +0000 Subject: [PATCH] added PatternMatcher to ruby code git-svn-id: http://biopieces.googlecode.com/svn/trunk@1305 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/Maasha/lib/patternmatcher.rb | 227 +++++++++++++++++++ code_ruby/Maasha/lib/seq.rb | 83 +------ code_ruby/Maasha/test/test_patternmatcher.rb | 172 ++++++++++++++ code_ruby/Maasha/test/test_seq.rb | 29 --- 4 files changed, 403 insertions(+), 108 deletions(-) create mode 100644 code_ruby/Maasha/lib/patternmatcher.rb create mode 100755 code_ruby/Maasha/test/test_patternmatcher.rb diff --git a/code_ruby/Maasha/lib/patternmatcher.rb b/code_ruby/Maasha/lib/patternmatcher.rb new file mode 100644 index 0000000..3d79d94 --- /dev/null +++ b/code_ruby/Maasha/lib/patternmatcher.rb @@ -0,0 +1,227 @@ +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# IUPAC nucleotide pair ambiguity equivalents +EQUAL = { + :AA => true, :BU => true, :TH => true, :UY => true, + :TT => true, :CB => true, :UH => true, :SC => true, + :CC => true, :GB => true, :VA => true, :SG => true, + :GG => true, :TB => true, :VC => true, :CS => true, + :UU => true, :UB => true, :VG => true, :GS => true, + :NA => true, :DA => true, :AV => true, :WA => true, + :NT => true, :DG => true, :CV => true, :WT => true, + :NC => true, :DT => true, :GV => true, :WU => true, + :NG => true, :DU => true, :KG => true, :AW => true, + :NU => true, :AD => true, :KT => true, :TW => true, + :AN => true, :GD => true, :KU => true, :UW => true, + :TN => true, :TD => true, :GK => true, :RA => true, + :CN => true, :UD => true, :TK => true, :RG => true, + :GN => true, :HA => true, :UK => true, :AR => true, + :UN => true, :HC => true, :YC => true, :GR => true, + :NN => true, :HT => true, :YT => true, :MA => true, + :BC => true, :HU => true, :YU => true, :MC => true, + :BG => true, :AH => true, :CY => true, :AM => true, + :BT => true, :CH => true, :TY => true, :CM => true, +} + +# Module containing code to locate nucleotide patterns in sequences allowing for +# ambiguity codes and a given maximum number of mismatches, insertions, and deletions. +# Insertions are nucleotides found in the pattern but not in the sequence. +# Deletions are nucleotides found in the sequence but not in the pattern. +module PatternMatcher + # ------------------------------------------------------------------------------ + # str.match(pattern[, pos[, max_mismatches[, max_insertions[, max_deletions]]]]) + # -> Match or nil + # + # ------------------------------------------------------------------------------ + # Method to locate the next pattern match starting from a given position. + # A match is located by exploring all possible paths allowing for a given + # maximum number of mismatches, insertions and deletions. If a match is + # located a Match object will be returned. If all paths are exhausted and + # no match is located the position is incremented. If no match is located + # whatsoever, then nil is returned. + # TODO: converging paths should be skipped for speed-up. + def match(pattern, pos = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0) + @pattern = pattern + @max_mismatches = max_mismatches + @max_insertions = max_insertions + @max_deletions = max_deletions + + while pos <= @seq.length - @pattern.length + @max_insertions + paths = [] + paths << Path.new(pos, seq_index = pos, pattern_index = 0) + + while not paths.empty? + new_paths = [] + + paths.each do |path| + next if path.exhausted?(@seq, @pattern) + return path.to_match if match_found?(path) + + if path.match?(@seq, @pattern) + new_paths << path.match + elsif path.mismatches < max_mismatches + new_paths << path.mismatch + end + + new_paths << path.insertion if path.insertions < max_insertions + new_paths << path.deletion if path.deletions < max_deletions + end + + paths = new_paths + end + + pos += 1 + end + end + + # ------------------------------------------------------------------------------ + # str.scan(pattern[, pos[, max_mismatches[, max_insertions[, max_deletions]]]]) + # -> Array + # str.scan(pattern[, pos[, max_mismatches[, max_insertions[, max_deletions]]]]) { |match| + # block + # } + # -> Match + # + # ------------------------------------------------------------------------------ + # Method to iterate through a sequence to locate pattern matches starting + # from a given position. A match is located by exploring all possible paths + # allowing for a given maximum number of mismatches, insertions and deletions. + # Matches found in block context return the Match object. Otherwise matches are + # returned in an Array. + def scan(pattern, pos = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0) + matches = [] + offset = pos + + while match = match(pattern, offset, max_mismatches, max_insertions, max_deletions) + if block_given? + yield match + else + matches << match + end + + offset = match.pos + 1 + end + + return matches unless block_given? + end + + private + + # Method to check if a path is complete and a match was found. + def match_found?(path) + if path.mismatches <= @max_mismatches and path.insertions <= @max_insertions and path.deletions <= @max_deletions + if path.matches + path.mismatches + path.insertions >= @pattern.length + true + end + end + end + + # Class for describing a path for matching a nucleotide sequence and a pattern. + class Path + attr_accessor :pos, :seq_index, :pattern_index, :matches, :mismatches, :insertions, :deletions, :length + + def initialize(pos, seq_index, pattern_index, matches = 0, mismatches = 0, insertions = 0, deletions = 0, length = 0) + @pos = pos + @seq_index = seq_index + @pattern_index = pattern_index + @matches = matches + @mismatches = mismatches + @insertions = insertions + @deletions = deletions + @length = length + end + + # Method to check if nucleotides match. + def match?(seq, pattern) + EQUAL["#{seq[self.seq_index]}#{pattern[self.pattern_index]}".upcase.to_sym] + end + + # Method to check if the path is exhausted. + def exhausted?(seq, pattern) + if self.seq_index - self.insertions > seq.length + true + elsif self.pattern_index > pattern.length + true + end + end + + # Method that returns a Match object created from a Path object. + def to_match + Match.new(@pos, @matches, @mismatches, @insertions, @deletions, @length) + end + + # Method that returns a new Match object for a matching path + def match + path_match = self.dup + path_match.length += 1 + path_match.matches += 1 + path_match.seq_index += 1 + path_match.pattern_index += 1 + path_match + end + + # Method that returns a new Match object for a matching path + def mismatch + path_mismatche = self.dup + path_mismatche.length += 1 + path_mismatche.mismatches += 1 + path_mismatche.seq_index += 1 + path_mismatche.pattern_index += 1 + path_mismatche + end + + # Method that returns a new Match object for a insertion path + def insertion + path_insertion = self.dup + path_insertion.insertions += 1 + path_insertion.pattern_index += 1 + path_insertion + end + + # Method that returns a new Match object for a deletion path + def deletion + path_deletion = self.dup + path_deletion.length += 1 + path_deletion.deletions += 1 + path_deletion.seq_index += 1 + path_deletion + end + end + + # Class for creating Match objects which contain the description of a + # match between a nucleotide sequence and a pattern. + class Match + attr_reader :pos, :matches, :mismatches, :insertions, :deletions, :length + + def initialize(pos, matches, mismatches, insertions, deletions, length) + @pos = pos + @matches = matches + @mismatches = mismatches + @insertions = insertions + @deletions = deletions + @length = length + end + end +end diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index a922fc7..30b4c59 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -24,7 +24,7 @@ require 'amatch' require 'digest' -require 'narray' +require 'patternmatcher' # Residue alphabets DNA = %w[a t c g] @@ -36,34 +36,12 @@ INDELS = %w[. - _ ~] SCORE_PHRED = 33 SCORE_ILLUMINA = 64 -# IUPAC nucleotide pair ambiguity equivalents -EQUAL = { - :AA => true, :BU => true, :TH => true, :UY => true, - :TT => true, :CB => true, :UH => true, :SC => true, - :CC => true, :GB => true, :VA => true, :SG => true, - :GG => true, :TB => true, :VC => true, :CS => true, - :UU => true, :UB => true, :VG => true, :GS => true, - :NA => true, :DA => true, :AV => true, :WA => true, - :NT => true, :DG => true, :CV => true, :WT => true, - :NC => true, :DT => true, :GV => true, :WU => true, - :NG => true, :DU => true, :KG => true, :AW => true, - :NU => true, :AD => true, :KT => true, :TW => true, - :AN => true, :GD => true, :KU => true, :UW => true, - :TN => true, :TD => true, :GK => true, :RA => true, - :CN => true, :UD => true, :TK => true, :RG => true, - :GN => true, :HA => true, :UK => true, :AR => true, - :UN => true, :HC => true, :YC => true, :GR => true, - :NN => true, :HT => true, :YT => true, :MA => true, - :BC => true, :HU => true, :YU => true, :MC => true, - :BG => true, :AH => true, :CY => true, :AM => true, - :BT => true, :CH => true, :TY => true, :CM => true, -} - # Error class for all exceptions to do with Seq. class SeqError < StandardError; end class Seq include Amatch + include PatternMatcher attr_accessor :seq_name, :seq, :type, :qual @@ -428,61 +406,6 @@ class Seq end end - # ------------------------------------------------------------------------------ - # seq.match(pattern[, pos ] [, hd=max] [, ed=max]) -> matchdata or nil - # ------------------------------------------------------------------------------ - # Method to locate a pattern in a sequence and return the position of the match - # or nil if no match was found. Hamming or Edit distance may be specified. - def match(pattern, pos = 0, hd = 0, ed = 0) - while pos < self.length - pattern.length + 1 - if ed == 0 - match = 0 - mis = 0 - - for i in 0 ... pattern.length do - EQUAL[(self.seq[pos + i].upcase + pattern[i].upcase).to_sym] ? match += 1 : mis += 1 - - break if mis > hd - end - - if pattern.length - hd == match - return pos - else - pos += (match - hd >= 0) ? 1 + match - hd : 1 - end - else - str1 = self.seq[pos ... pos + pattern.length] - str2 = pattern - - rows = str1.length + 1 - cols = str2.length + 1 - - matrix = NArray.int(rows, cols) - - for i in 0 ... rows do matrix[i, 0] = i end - for j in 0 ... cols do matrix[0, j] = j end - - for j in 1 ... cols do - for i in 1 ... rows do - if EQUAL[(str1[i - 1].upcase + str2[j - 1].upcase).to_sym] - matrix[i, j] = matrix[i - 1, j - 1] - else - del = matrix[i - 1, j] + 1 - ins = matrix[i, j - 1] + 1 - mis = matrix[i - 1, j - 1] + 1 - - matrix[i, j] = [del, ins, mis].min - end - end - end - - return pos if matrix[rows - 1, cols - 1] <= ed - - pos += 1 - end - end - end - private # Method to convert a Solexa score (odd ratio) to @@ -499,3 +422,5 @@ class Seq (score_phred + 64).chr end end + +__END__ diff --git a/code_ruby/Maasha/test/test_patternmatcher.rb b/code_ruby/Maasha/test/test_patternmatcher.rb new file mode 100755 index 0000000..e6d5474 --- /dev/null +++ b/code_ruby/Maasha/test/test_patternmatcher.rb @@ -0,0 +1,172 @@ +#!/usr/bin/env ruby +$:.unshift File.join(File.dirname(__FILE__),'..','lib') + +# Copyright (C) 2007-2010 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'seq' +require 'test/unit' +require 'pp' + +class TestPatternMatcher < Test::Unit::TestCase + def setup + @entry = Seq.new("test", "atcg") + end + + def test_PatternMatcher_match_with_perfect_match_returns_ok + assert_equal(4, @entry.match("atcg").matches) + assert_equal(2, @entry.match("cg").matches) + end + + def test_PatternMatcher_match_with_perfect_match_with_ambiguity_returns_ok + assert_equal(4, @entry.match("aNcg").matches) + end + + def test_PatternMatcher_match_with_fail_match_returns_nil + assert_nil(@entry.match("gggg")) + end + + def test_PatternMatcher_match_with_one_mismatch_with_zero_allowed_returns_nil + assert_nil(@entry.match("aAcg")) + end + + def test_PatternMatcher_match_with_one_mismatch_with_one_allowed_returns_ok + assert_equal(1, @entry.match("aGcg", pos = 0, mismatches = 1).mismatches) + end + + def test_PatternMatcher_match_with_two_mismatch_with_one_allowed_returns_nil + assert_nil(@entry.match("CtcA", pos = 0, mismatches = 1)) + end + + def test_PatternMatcher_match_with_two_mismatch_with_two_allowed_returns_ok + assert_equal(2, @entry.match("CtcA", pos = 0, mismatches = 2).mismatches) + end + + def test_PatternMatcher_match_with_one_insertion_with_zero_allowed_returns_nil + assert_nil(@entry.match("atTcg", pos = 0, mismatches = 0, insertions = 0)) + assert_nil(@entry.match("Tatcg", pos = 0, mismatches = 0, insertions = 0)) + assert_nil(@entry.match("atcgT", pos = 0, mismatches = 0, insertions = 0)) + end + + def test_PatternMatcher_match_with_one_insertion_with_one_allowed_returns_ok + assert_equal(1, @entry.match("atTcg", pos = 0, mismatches = 0, insertions = 1).insertions) + end + + def test_PatternMatcher_match_with_two_insertion_with_one_allowed_returns_nil + assert_nil(@entry.match("aCCtcg", pos = 0, mismatches = 0, insertions = 1)) + assert_nil(@entry.match("CCatcg", pos = 0, mismatches = 0, insertions = 1)) + assert_nil(@entry.match("atcgCC", pos = 0, mismatches = 0, insertions = 1)) + assert_nil(@entry.match("CatcgC", pos = 0, mismatches = 0, insertions = 1)) + end + + def test_PatternMatcher_match_with_two_insertion_with_two_allowed_returns_ok + assert_equal(2, @entry.match("aCCtcg", pos = 0, mismatches = 0, insertions = 2).insertions) + assert_equal(2, @entry.match("CCatcg", pos = 0, mismatches = 0, insertions = 2).insertions) + assert_equal(2, @entry.match("atcgCC", pos = 0, mismatches = 0, insertions = 2).insertions) + assert_equal(2, @entry.match("CatcgC", pos = 0, mismatches = 0, insertions = 2).insertions) + end + + def test_PatternMatcher_match_with_one_deletion_with_zero_allowed_returns_nil + assert_nil(@entry.match("acg")) + assert_nil(@entry.match("atg")) + end + + def test_PatternMatcher_match_with_one_deletion_with_one_allowed_returns_ok + assert_equal(1, @entry.match("tcg", pos = 0, mismatchses = 0, insertions = 0, deletions = 1).deletions) + assert_equal(1, @entry.match("acg", pos = 0, mismatchses = 0, insertions = 0, deletions = 1).deletions) + assert_equal(1, @entry.match("atg", pos = 0, mismatchses = 0, insertions = 0, deletions = 1).deletions) + end + + def test_PatternMatcher_match_with_two_deletion_with_one_allowed_returns_nil + assert_nil(@entry.match("ag", pos = 0, mismatchses = 0, insertions = 0, deletions = 1)) + end + + def test_PatternMatcher_match_with_two_deletion_with_two_allowed_returns_ok + assert_equal(2, @entry.match("cg", pos = 0, mismatchses = 0, insertions = 0, deletions = 2).deletions) + assert_equal(2, @entry.match("tg", pos = 0, mismatchses = 0, insertions = 0, deletions = 2).deletions) + assert_equal(2, @entry.match("ag", pos = 0, mismatchses = 0, insertions = 0, deletions = 2).deletions) + end + + def test_PatternMatcher_match_with_one_mismatch_one_insertions_one_deletion_returns_ok + assert_equal(1, @entry.match("ggtg", pos = 0, mismatchses = 1, insertions = 1, deletions = 1).mismatches) + assert_equal(1, @entry.match("ggtg", pos = 0, mismatchses = 1, insertions = 1, deletions = 1).insertions) + assert_equal(1, @entry.match("ggtg", pos = 0, mismatchses = 1, insertions = 1, deletions = 1).deletions) + end + + # atcgagctagctagctagctgactac + # ax + # t x + # g i + # g i + # c x + # g x + # + # at--cg + # || || + # atggcg + def test_PatternMatcher_match_with_two_insertions_and_two_allowed_returns_ok + entry = Seq.new("test", "atcgagctagctagctagctgactac") + assert_equal(2, entry.match("atggcg", pos = 0, mismatchses = 0, insertions = 2, deletions = 0).insertions) + end + + # atggcgagctagctagctagctgactac + # ax + # t xdd + # c x + # g x + # + # atggcg + # || || + # at--cg + def test_PatternMatcher_match_with_two_deletions_and_two_allowed_returns_ok + entry = Seq.new("test", "atggcgagctagctagctagctgactac") + assert_equal(2, entry.match("atcg", pos = 0, mismatchses = 0, insertions = 0, deletions = 2).deletions) + end + + # ataacgagctagctagctagctgactac + # ax + # gi + # t xdd + # c x + # g x + # + # a-taacg + # | | || + # agt--cg + def test_PatternMatcher_match_with_one_insertions_and_two_deletions_all_allowed_returns_ok + entry = Seq.new("test", "ataacgagctagctagctagctgactac") + assert_equal(1, entry.match("agtcg", pos = 0, mismatchses = 0, insertions = 1, deletions = 2).insertions) + assert_equal(2, entry.match("agtcg", pos = 0, mismatchses = 0, insertions = 1, deletions = 2).deletions) + end + + def test_Pattern_Matcher_scan_locates_three_patterns_ok + entry = Seq.new("test", "ataacgagctagctagctagctgactac") + assert_equal(3, entry.scan("tag").count) + end + + def test_Pattern_Matcher_scan_with_pos_locates_two_patterns_ok + entry = Seq.new("test", "ataacgagctagctagctagctgactac") + assert_equal(2, entry.scan("tag", 10).count) + end +end diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index 0e986f6..5caed92 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -441,35 +441,6 @@ class TestSeq < Test::Unit::TestCase @entry.adaptor_clip_left("cgax", 25) assert_equal( "efghi", @entry.qual) end - - def test_Seq_match_with_no_match_returns_nil - @entry.seq = "atcg" - assert_equal(nil, @entry.match("ttt")) - end - - def test_Seq_match_returns_correctly - @entry.seq = "atcgatct" - assert_equal(0, @entry.match("aTc")) - assert_equal(4, @entry.match("aNct")) - end - - def test_Seq_match_with_pos_returns_correctly - @entry.seq = "atcatc" - assert_equal(3, @entry.match("aTc", 2)) - end - - def test_Seq_match_with_hamming_dist_returns_correctly - @entry.seq = "atcg" - assert_equal(0, @entry.match("XTCG", pos=0, hd=1)) - assert_equal(0, @entry.match("TTCX", pos=0, hd=2)) - assert_equal(0, @entry.match("XXCX", pos=0, hd=3)) - assert_equal(0, @entry.match("XXXX", pos=0, hd=4)) - end - - def test_Seq_match_with_pos_and_hamming_dist_returns_correctly - @entry.seq = "atcgttcg" - assert_equal(4, @entry.match("ATCG", pos=1, hd=1)) - end end -- 2.39.5