1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
27 # Module containing code to locate nucleotide patterns in sequences allowing for
28 # ambiguity codes and a given maximum edit distance.
29 # Insertions are nucleotides found in the pattern but not in the sequence.
30 # Deletions are nucleotides found in the sequence but not in the pattern.
32 # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
33 # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
35 # ------------------------------------------------------------------------------
36 # str.patmatch(pattern[, pos[, max_edit_distance]])
38 # str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
43 # ------------------------------------------------------------------------------
44 # Method to iterate through a sequence to locate pattern matches starting from a
45 # given position and allowing for a maximum edit distance. Matches found in
46 # block context return the Match object. Otherwise matches are returned in an
48 def patmatch(pattern, pos = 0, max_edit_distance = 0)
49 self.patscan(pattern, pos, max_edit_distance) do |m|
54 # ------------------------------------------------------------------------------
55 # str.patscan(pattern[, pos[, max_edit_distance]])
57 # str.patscan(pattern[, pos[, max_edit_distance]]) { |match|
62 # ------------------------------------------------------------------------------
63 # Method to iterate through a sequence to locate pattern matches starting from a
64 # given position and allowing for a maximum edit distance. Matches found in
65 # block context return the Match object. Otherwise matches are returned in an
67 def patscan(pattern, pos = 0, max_edit_distance = 0)
70 while result = match_C(self.seq, self.length, pattern, pattern.length, pos, max_edit_distance)
71 match = Match.new(*result, self.seq[result[0] ... result[0] + result[1]]);
82 return matches unless block_given?
88 # Macro for matching nucleotides including ambiguity codes.
94 #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
107 # Bitmap for matching nucleotides including ambiguity codes.
108 # For each value bits are set from the left: bit pos 1 for A,
109 # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
117 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
118 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
119 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
126 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
132 void vector_init(score *vec, unsigned int vec_len)
136 for (i = 1; i < vec_len; i++)
145 void vector_print(score *vec, unsigned int vec_len)
149 for (i = 0; i < vec_len; i++)
151 printf("i: %d mis: %d ins: %d del: %d ed: %d\\n", i, vec[i].mis, vec[i].ins, vec[i].del, vec[i].ed);
159 int match_found(score *vec, unsigned int pat_len, unsigned int max_ed)
161 return (vec[pat_len].ed <= max_ed);
166 void vector_update(score *vec, char *seq, char *pat, unsigned int pat_len, unsigned int pos)
169 score up = {0, 0, 0, 0}; // insertion
170 score left = vec[1]; // deletion
171 score new = {0, 0, 0, 0};
175 for (i = 0; i < pat_len; i++)
177 if (MATCH(seq[pos], pat[i])) // match
183 if (left.ed <= diag.ed && left.ed <= up.ed) // deletion
188 else if (diag.ed <= up.ed && diag.ed <= left.ed) // mismatch
193 else if (up.ed <= diag.ed && up.ed <= left.ed) // insertion
200 printf("This should not happen\\n");
218 VALUE _seq, // Sequence
219 VALUE _seq_len, // Sequence length
220 VALUE _pat, // Pattern
221 VALUE _pat_len, // Pattern length
222 VALUE _pos, // Offset position
223 VALUE _max_ed // Maximum edit distance
226 char *seq = (char *) StringValuePtr(_seq);
227 char *pat = (char *) StringValuePtr(_pat);
228 unsigned int seq_len = FIX2UINT(_seq_len);
229 unsigned int pat_len = FIX2UINT(_pat_len);
230 unsigned int pos = FIX2UINT(_pos);
231 unsigned int max_ed = FIX2UINT(_max_ed);
233 score vec[MAX_PAT] = {0};
234 unsigned int vec_len = pat_len + 1;
235 unsigned int match_beg = 0;
236 unsigned int match_len = 0;
240 vector_init(vec, vec_len);
242 while (pos < seq_len)
244 vector_update(vec, seq, pat, pat_len, pos);
246 if (match_found(vec, pat_len, max_ed))
248 match_len = pat_len - vec[pat_len].ins + vec[pat_len].del;
249 match_beg = pos - match_len + 1;
251 match_ary = rb_ary_new();
252 rb_ary_push(match_ary, INT2FIX(match_beg));
253 rb_ary_push(match_ary, INT2FIX(match_len));
254 rb_ary_push(match_ary, INT2FIX(vec[pat_len].mis));
255 rb_ary_push(match_ary, INT2FIX(vec[pat_len].ins));
256 rb_ary_push(match_ary, INT2FIX(vec[pat_len].del));
264 return Qfalse; // no match
270 attr_accessor :beg, :length, :mis, :ins, :del, :match
272 def initialize(beg, length, mis, ins, del, match)