1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
27 # Module containing code to locate nucleotide patterns in sequences allowing for
28 # ambiguity codes and a given maximum edit distance.
29 # Insertions are nucleotides found in the pattern but not in the sequence.
30 # Deletions are nucleotides found in the sequence but not in the pattern.
32 # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
33 # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
35 # ------------------------------------------------------------------------------
36 # str.patmatch(pattern[, pos[, max_edit_distance]])
38 # str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
43 # ------------------------------------------------------------------------------
44 # Method to iterate through a sequence to locate the first pattern match
45 # starting from a given position and allowing for a maximum edit distance.
46 def patmatch(pattern, pos = 0, max_edit_distance = 0)
47 self.patscan(pattern, pos, max_edit_distance) do |m|
52 # ------------------------------------------------------------------------------
53 # str.patscan(pattern[, pos[, max_edit_distance]])
55 # str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
60 # ------------------------------------------------------------------------------
61 # Method to iterate through a sequence to locate pattern matches starting from a
62 # given position and allowing for a maximum edit distance. Matches found in
63 # block context return the Match object. Otherwise matches are returned in an
65 def patscan(pattern, pos = 0, max_edit_distance = 0)
68 while result = match_C(self.seq, self.length, pattern, pattern.length, pos, max_edit_distance)
69 match = Match.new(*result, self.seq[result[0] ... result[0] + result[1]]);
80 return matches unless block_given?
86 # Macro for matching nucleotides including ambiguity codes.
92 #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
105 # Bitmap for matching nucleotides including ambiguity codes.
106 # For each value bits are set from the left: bit pos 1 for A,
107 # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
115 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
116 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
117 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
130 void vector_init(score *vec, unsigned int vec_len)
134 for (i = 1; i < vec_len; i++)
143 void vector_print(score *vec, unsigned int vec_len)
147 for (i = 0; i < vec_len; i++)
149 printf("i: %d mis: %d ins: %d del: %d ed: %d\\n", i, vec[i].mis, vec[i].ins, vec[i].del, vec[i].ed);
157 int match_found(score *vec, unsigned int pat_len, unsigned int max_ed)
159 return (vec[pat_len].ed <= max_ed);
164 void vector_update(score *vec, char *seq, char *pat, unsigned int pat_len, unsigned int pos)
167 score up = {0, 0, 0, 0}; // insertion
168 score left = vec[1]; // deletion
169 score new = {0, 0, 0, 0};
173 for (i = 0; i < pat_len; i++)
175 if (MATCH(seq[pos], pat[i])) // match
181 if (left.ed <= diag.ed && left.ed <= up.ed) // deletion
186 else if (diag.ed <= up.ed && diag.ed <= left.ed) // mismatch
191 else if (up.ed <= diag.ed && up.ed <= left.ed) // insertion
198 printf("This should not happen\\n");
216 VALUE _seq, // Sequence
217 VALUE _seq_len, // Sequence length
218 VALUE _pat, // Pattern
219 VALUE _pat_len, // Pattern length
220 VALUE _pos, // Offset position
221 VALUE _max_ed // Maximum edit distance
224 char *seq = (char *) StringValuePtr(_seq);
225 char *pat = (char *) StringValuePtr(_pat);
226 unsigned int seq_len = FIX2UINT(_seq_len);
227 unsigned int pat_len = FIX2UINT(_pat_len);
228 unsigned int pos = FIX2UINT(_pos);
229 unsigned int max_ed = FIX2UINT(_max_ed);
231 score vec[MAX_PAT] = {0};
232 unsigned int vec_len = pat_len + 1;
233 unsigned int match_beg = 0;
234 unsigned int match_len = 0;
238 vector_init(vec, vec_len);
240 while (pos < seq_len)
242 vector_update(vec, seq, pat, pat_len, pos);
244 if (match_found(vec, pat_len, max_ed))
246 match_len = pat_len - vec[pat_len].ins + vec[pat_len].del;
247 match_beg = pos - match_len + 1;
249 match_ary = rb_ary_new();
250 rb_ary_push(match_ary, INT2FIX(match_beg));
251 rb_ary_push(match_ary, INT2FIX(match_len));
252 rb_ary_push(match_ary, INT2FIX(vec[pat_len].mis));
253 rb_ary_push(match_ary, INT2FIX(vec[pat_len].ins));
254 rb_ary_push(match_ary, INT2FIX(vec[pat_len].del));
262 return Qfalse; // no match
268 attr_accessor :beg, :length, :mis, :ins, :del, :match
270 def initialize(beg, length, mis, ins, del, match)