1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'inline' # RubyInline
27 # Error class for all exceptions to do with BackTrack.
28 class BackTrackError < StandardError; end
30 # Module containing code to locate nucleotide patterns in sequences allowing for
31 # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
32 # pattern match engine is based on a backtrack algorithm.
33 # Insertions are nucleotides found in the pattern but not in the sequence.
34 # Deletions are nucleotides found in the sequence but not in the pattern.
35 # Algorithm based on code kindly provided by j_random_hacker @ Stackoverflow.
37 OK_PATTERN = Regexp.new('^[flsycwphqrimtnkvadegu]+$')
38 MAX_MIS = 5 # Maximum number of mismatches allowed
39 MAX_INS = 5 # Maximum number of insertions allowed
40 MAX_DEL = 5 # Maximum number of deletions allowed
42 # ------------------------------------------------------------------------------
43 # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]])
45 # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) { |match|
50 # ------------------------------------------------------------------------------
51 # Method to iterate through a sequence to locate pattern matches starting at a
52 # given offset allowing for a maximum number of mismatches, insertions, and
53 # deletions. Insertions are nucleotides found in the pattern but not in the
54 # sequence. Deletions are nucleotides found in the sequence but not in the
55 # pattern. Matches found in block context return the Match object. Otherwise
56 # matches are returned in an Array of Match objects.
57 def patscan(pattern, offset = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
58 raise BackTrackError, "Bad pattern: #{pattern}" unless pattern.downcase =~ OK_PATTERN
59 raise BackTrackError, "offset: #{offset} out of range (0 ... #{self.length - 1})" unless (0 ... self.length).include? offset
60 raise BackTrackError, "max_mismatches: #{max_mismatches} out of range (0 .. #{MAX_MIS})" unless (0 .. MAX_MIS).include? max_mismatches
61 raise BackTrackError, "max_insertions: #{max_insertions} out of range (0 .. #{MAX_INS})" unless (0 .. MAX_INS).include? max_insertions
62 raise BackTrackError, "max_deletions: #{max_deletions} out of range (0 .. #{MAX_DEL})" unless (0 .. MAX_DEL).include? max_deletions
67 while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions)
68 yield Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
69 offset = result.first + 1
72 while result = self.track(pattern, offset, max_mismatches, max_insertions, max_deletions)
73 matches << Match.new(result.first, result.last, self.seq[result.first ... result.first + result.last])
74 offset = result.first + 1
78 return matches.empty? ? nil : matches unless block_given?
84 builder.add_static "id_seq", 'rb_intern("@seq")', "ID"
87 #define MATCH(A,B) ((bitmap[(unsigned short) A] & bitmap[(unsigned short) B]) != 0)
91 unsigned short bitmap[256] = {
92 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
93 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
97 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
98 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
99 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
111 # ss is the start of the string, used only for reporting the match endpoints.
113 unsigned int backtrack(char* ss, char* s, char* p, int mm, int ins, int del)
117 while (*s && MATCH(*s, *p)) ++s, ++p; // OK to always match longest segment
120 return (unsigned int) (s - ss);
123 if (mm && *s && *p && (r = backtrack(ss, s + 1, p + 1, mm - 1, ins, del))) return r;
124 if (ins && *p && (r = backtrack(ss, s, p + 1, mm, ins - 1, del))) return r;
125 if (del && *s && (r = backtrack(ss, s + 1, p, mm, ins, del - 1))) return r;
132 # Find all occurrences of p starting at pos in s, with at most
133 # mm mismatches, ins insertions and del deletions.
135 static VALUE track(char* p, unsigned int pos, int mm, int ins, int del)
137 VALUE seq = rb_ivar_get(self, id_seq);
138 char* s = StringValuePtr(seq);
149 e = backtrack(ss, s, p, mm, ins, del);
153 tuple = rb_ary_new();
154 rb_ary_push(tuple, INT2FIX((int) (s - ss)));
155 rb_ary_push(tuple, INT2FIX((int) e - (s - ss)));
169 attr_reader :pos, :length, :match
171 def initialize(pos, length, match)
178 "#{pos}:#{length}:#{match}"