# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-# IUPAC nucleotide pair ambiguity equivalents are saved in an
-# array of bit fields.
-
-BIT_A = 1 << 0
-BIT_T = 1 << 1
-BIT_C = 1 << 2
-BIT_G = 1 << 3
-
-EQUAL = Array.new(256, 0)
-EQUAL['A'.ord] = BIT_A
-EQUAL['a'.ord] = BIT_A
-EQUAL['T'.ord] = BIT_T
-EQUAL['t'.ord] = BIT_T
-EQUAL['U'.ord] = BIT_T
-EQUAL['u'.ord] = BIT_T
-EQUAL['C'.ord] = BIT_C
-EQUAL['c'.ord] = BIT_C
-EQUAL['G'.ord] = BIT_G
-EQUAL['g'.ord] = BIT_G
-EQUAL['M'.ord] = (BIT_A|BIT_C)
-EQUAL['m'.ord] = (BIT_A|BIT_C)
-EQUAL['R'.ord] = (BIT_A|BIT_G)
-EQUAL['r'.ord] = (BIT_A|BIT_G)
-EQUAL['W'.ord] = (BIT_A|BIT_T)
-EQUAL['w'.ord] = (BIT_A|BIT_T)
-EQUAL['S'.ord] = (BIT_C|BIT_G)
-EQUAL['s'.ord] = (BIT_C|BIT_G)
-EQUAL['Y'.ord] = (BIT_C|BIT_T)
-EQUAL['y'.ord] = (BIT_C|BIT_T)
-EQUAL['K'.ord] = (BIT_G|BIT_T)
-EQUAL['k'.ord] = (BIT_G|BIT_T)
-EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)
-EQUAL['b'.ord] = (BIT_C|BIT_G|BIT_T)
-EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)
-EQUAL['d'.ord] = (BIT_A|BIT_G|BIT_T)
-EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)
-EQUAL['h'.ord] = (BIT_A|BIT_C|BIT_T)
-EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)
-EQUAL['v'.ord] = (BIT_A|BIT_C|BIT_G)
-EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
-EQUAL['n'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
-
-# Module containing code to locate nucleotide patterns in sequences allowing for
-# ambiguity codes and a given maximum edit distance.
-# Insertions are nucleotides found in the pattern but not in the sequence.
-# Deletions are nucleotides found in the sequence but not in the pattern.
-#
-# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
-# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
-module PatternMatcher
- # ------------------------------------------------------------------------------
- # str.match(pattern[, pos[, max_edit_distance]])
- # -> Match or nil
- #
- # ------------------------------------------------------------------------------
- # Method to locate the next pattern match starting from a given position. A match
- # is allowed to contain a given maximum edit distance. If a match is located a
- # Match object will be returned otherwise nil.
- def match(pattern, pos = 0, max_edit_distance = 0)
- vector = Vector.new(@seq, pattern, max_edit_distance)
-
- while pos < @seq.length
- vector.update(pos)
-
- return vector.to_match(pos) if vector.match_found?
-
- pos += 1
- end
-
- nil # no match
- end
+require 'inline'
+module PatternMatcher
# ------------------------------------------------------------------------------
# str.scan(pattern[, pos[, max_edit_distance]])
# -> Array
def scan(pattern, pos = 0, max_edit_distance = 0)
matches = []
- while match = match(pattern, pos, max_edit_distance)
+ while result = match_C(self.seq, self.length, pattern, pattern.length, pos, max_edit_distance)
+ match = Match.new(*result, self.seq[result[0] ... result[0] + result[1]]);
+
if block_given?
yield match
else
matches << match
end
- pos = match.pos + 1
+ pos = match.beg + 1
end
return matches unless block_given?
end
-end
-
-# Class containing the score vector used for locating matches.
-class Vector
- # Method to initailize the score vector.
- def initialize(seq, pattern, max_edit_distance)
- @seq = seq
- @pattern = pattern
- @max_edit_distance = max_edit_distance
- @vector = []
-
- (0 ... @pattern.length + 1).each do |i|
- @vector[i] = Score.new(0, 0, i, 0, i)
- end
- end
-
- # Method to update the score vector.
- def update(pos)
- score_diag = @vector[0]
- score_up = Score.new # insertion
- score_left = @vector[1] # deletion
-
- (0 ... @pattern.length).each do |i|
- if match?(@seq[pos], @pattern[i])
- new_score = score_diag.dup
- new_score.matches += 1
- else
- if deletion?(score_diag, score_up, score_left)
- new_score = score_left.dup
- new_score.deletions += 1
- elsif mismatch?(score_diag, score_up, score_left)
- new_score = score_diag.dup
- new_score.mismatches += 1
- elsif insertion?(score_diag, score_up, score_left)
- new_score = score_up.dup
- new_score.insertions += 1
- end
-
- new_score.edit_distance += 1
- end
-
- score_diag = @vector[i + 1]
- score_up = new_score
- score_left = @vector[i + 2]
-
- @vector[i + 1] = new_score
- end
- end
-
- # Method that determines if a match was found by analyzing the score vector.
- def match_found?
- if @vector.last.edit_distance <= @max_edit_distance
- true
- end
- end
-
- # Method that returns a Match object initialized with
- # information from the score vector.
- def to_match(pos)
- matches = @vector.last.matches
- mismatches = @vector.last.mismatches
- insertions = @vector.last.insertions
- deletions = @vector.last.deletions
- length = @pattern.length - insertions + deletions
- offset = pos - length + 1
- match = @seq[offset ... offset + length]
-
- Match.new(offset, match, matches, mismatches, insertions, deletions, length)
- end
-
- # Method to convert the score vector to a string.
- def to_s
- "(m,m,i,d,e)\n" + @vector.join("\n") + "\n\n"
- end
-
- private
-
- # Method to determine if a match occurred.
- def match?(char1, char2)
- (EQUAL[char1.ord] & EQUAL[char2.ord]) != 0
- end
- # Method to determine if a mismatch occured.
- def mismatch?(score_diag, score_up, score_left)
- if score_diag.edit_distance <= score_up.edit_distance and
- score_diag.edit_distance <= score_left.edit_distance
- true
- end
+ inline do |builder|
+ # Macro for matching nucleotides including ambiguity codes.
+ builder.prefix %{
+ #define MAX_PAT 1024
+ }
+
+ builder.prefix %{
+ #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0)
+ }
+
+ builder.prefix %{
+ typedef struct
+ {
+ unsigned int mis;
+ unsigned int ins;
+ unsigned int del;
+ unsigned int ed;
+ } score;
+ }
+
+ # Bitmap for matching nucleotides including ambiguity codes.
+ # For each value bits are set from the left: bit pos 1 for A,
+ # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G.
+ builder.prefix %{
+ char bitmap[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
+ 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
+ 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0,
+ 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ }
+
+ builder.prefix %{
+ void vector_init(score *vec, unsigned int vec_len)
+ {
+ unsigned int i = 0;
+
+ for (i = 1; i < vec_len; i++)
+ {
+ vec[i].ins = i;
+ vec[i].ed = i;
+ }
+ }
+ }
+
+ builder.prefix %{
+ void vector_print(score *vec, unsigned int vec_len)
+ {
+ unsigned int i = 0;
+
+ for (i = 0; i < vec_len; i++)
+ {
+ printf("i: %d mis: %d ins: %d del: %d ed: %d\\n", i, vec[i].mis, vec[i].ins, vec[i].del, vec[i].ed);
+ }
+
+ printf("---\\n");
+ }
+ }
+
+ builder.prefix %{
+ int match_found(score *vec, unsigned int pat_len, unsigned int max_ed)
+ {
+ return (vec[pat_len].ed <= max_ed);
+ }
+ }
+
+ builder.prefix %{
+ void vector_update(score *vec, char *seq, char *pat, unsigned int pat_len, unsigned int pos)
+ {
+ score diag = vec[0];
+ score up = {0, 0, 0, 0}; // insertion
+ score left = vec[1]; // deletion
+ score new = {0, 0, 0, 0};
+
+ unsigned int i = 0;
+
+ for (i = 0; i < pat_len; i++)
+ {
+ if (MATCH(seq[pos], pat[i])) // match
+ {
+ new = diag;
+ }
+ else
+ {
+ if (left.ed <= diag.ed && left.ed <= up.ed) // deletion
+ {
+ new = left;
+ new.del++;
+ }
+ else if (diag.ed <= up.ed && diag.ed <= left.ed) // mismatch
+ {
+ new = diag;
+ new.mis++;
+ }
+ else if (up.ed <= diag.ed && up.ed <= left.ed) // insertion
+ {
+ new = up;
+ new.ins++;
+ }
+ else
+ {
+ printf("This should not happen\\n");
+ exit(1);
+ }
+
+ new.ed++;
+ }
+
+ diag = vec[i + 1];
+ up = new;
+ left = vec[i + 2];
+
+ vec[i + 1] = new;
+ }
+ }
+ }
+
+ builder.c %{
+ VALUE match_C(
+ VALUE _seq, // Sequence
+ VALUE _seq_len, // Sequence length
+ VALUE _pat, // Pattern
+ VALUE _pat_len, // Pattern length
+ VALUE _pos, // Offset position
+ VALUE _max_ed // Maximum edit distance
+ )
+ {
+ char *seq = (char *) StringValuePtr(_seq);
+ char *pat = (char *) StringValuePtr(_pat);
+ unsigned int seq_len = FIX2UINT(_seq_len);
+ unsigned int pat_len = FIX2UINT(_pat_len);
+ unsigned int pos = FIX2UINT(_pos);
+ unsigned int max_ed = FIX2UINT(_max_ed);
+
+ score vec[MAX_PAT] = {0};
+ unsigned int vec_len = pat_len + 1;
+ unsigned int match_beg = 0;
+ unsigned int match_len = 0;
+
+ VALUE match_ary;
+
+ vector_init(vec, vec_len);
+
+ while (pos < seq_len)
+ {
+ vector_update(vec, seq, pat, pat_len, pos);
+
+ if (match_found(vec, pat_len, max_ed))
+ {
+ match_len = pat_len - vec[pat_len].ins + vec[pat_len].del;
+ match_beg = pos - match_len + 1;
+
+ match_ary = rb_ary_new();
+ rb_ary_push(match_ary, INT2FIX(match_beg));
+ rb_ary_push(match_ary, INT2FIX(match_len));
+ rb_ary_push(match_ary, INT2FIX(vec[pat_len].mis));
+ rb_ary_push(match_ary, INT2FIX(vec[pat_len].ins));
+ rb_ary_push(match_ary, INT2FIX(vec[pat_len].del));
+
+ return match_ary;
+ }
+
+ pos++;
+ }
+
+ return Qfalse; // no match
+ }
+ }
end
- # Method to determine if an insertion occured.
- def insertion?(score_diag, score_up, score_left)
- if score_up.edit_distance <= score_diag.edit_distance and
- score_up.edit_distance <= score_left.edit_distance
- true
- end
- end
+ class Match
+ attr_accessor :beg, :length, :mis, :ins, :del, :match
- # Method to determine if a deletion occured.
- def deletion?(score_diag, score_up, score_left)
- if score_left.edit_distance <= score_diag.edit_distance and
- score_left.edit_distance <= score_up.edit_distance
- true
+ def initialize(beg, length, mis, ins, del, match)
+ @beg = beg
+ @length = length
+ @mis = mis
+ @ins = ins
+ @del = del
+ @match = match
end
end
end
-# Class to instantiate Score objects that holds score information.
-class Score
- attr_accessor :matches, :mismatches, :insertions, :deletions, :edit_distance
-
- def initialize(matches = 0, mismatches = 0, insertions = 0, deletions = 0, edit_distance = 0)
- @matches = matches
- @mismatches = mismatches
- @insertions = insertions
- @deletions = deletions
- @edit_distance = edit_distance
- end
- def to_s
- "(#{[self.matches, self.mismatches, self.insertions, self.deletions, self.edit_distance].join(',')})"
- end
-end
-
-# Class for creating Match objects which contain the description of a
-# match between a nucleotide sequence and a pattern.
-class Match
- attr_reader :pos, :match, :matches, :mismatches, :insertions, :deletions, :edit_distance, :length
-
- def initialize(pos, match, matches, mismatches, insertions, deletions, length)
- @pos = pos
- @match = match
- @matches = matches
- @mismatches = mismatches
- @insertions = insertions
- @deletions = deletions
- @edit_distance = mismatches + insertions + deletions
- @length = length
- end
-end
+__END__