From fca95d66e32c26175555a254142375fffc9e0056 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Mon, 3 Dec 2012 21:45:03 +0000 Subject: [PATCH] ported patternmatcher to C git-svn-id: http://biopieces.googlecode.com/svn/trunk@2022 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/lib/maasha/seq.rb | 5 - code_ruby/lib/maasha/seq/patternmatcher.rb | 400 ++++++++++----------- 2 files changed, 196 insertions(+), 209 deletions(-) diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index 41c221d..913f4d9 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -23,10 +23,7 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/bits' -#require 'maasha/seq/backtrack' require 'maasha/seq/digest' -#require 'maasha/seq/patscan' -require 'maasha/seq/patternmatcher' require 'maasha/seq/trim' require 'narray' @@ -76,8 +73,6 @@ SCORE_MAX = 40 class SeqError < StandardError; end class Seq - #include Patscan - include PatternMatcher include Digest include Trim diff --git a/code_ruby/lib/maasha/seq/patternmatcher.rb b/code_ruby/lib/maasha/seq/patternmatcher.rb index ba83c15..415c337 100644 --- a/code_ruby/lib/maasha/seq/patternmatcher.rb +++ b/code_ruby/lib/maasha/seq/patternmatcher.rb @@ -22,78 +22,9 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# IUPAC nucleotide pair ambiguity equivalents are saved in an -# array of bit fields. - -BIT_A = 1 << 0 -BIT_T = 1 << 1 -BIT_C = 1 << 2 -BIT_G = 1 << 3 - -EQUAL = Array.new(256, 0) -EQUAL['A'.ord] = BIT_A -EQUAL['a'.ord] = BIT_A -EQUAL['T'.ord] = BIT_T -EQUAL['t'.ord] = BIT_T -EQUAL['U'.ord] = BIT_T -EQUAL['u'.ord] = BIT_T -EQUAL['C'.ord] = BIT_C -EQUAL['c'.ord] = BIT_C -EQUAL['G'.ord] = BIT_G -EQUAL['g'.ord] = BIT_G -EQUAL['M'.ord] = (BIT_A|BIT_C) -EQUAL['m'.ord] = (BIT_A|BIT_C) -EQUAL['R'.ord] = (BIT_A|BIT_G) -EQUAL['r'.ord] = (BIT_A|BIT_G) -EQUAL['W'.ord] = (BIT_A|BIT_T) -EQUAL['w'.ord] = (BIT_A|BIT_T) -EQUAL['S'.ord] = (BIT_C|BIT_G) -EQUAL['s'.ord] = (BIT_C|BIT_G) -EQUAL['Y'.ord] = (BIT_C|BIT_T) -EQUAL['y'.ord] = (BIT_C|BIT_T) -EQUAL['K'.ord] = (BIT_G|BIT_T) -EQUAL['k'.ord] = (BIT_G|BIT_T) -EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T) -EQUAL['b'.ord] = (BIT_C|BIT_G|BIT_T) -EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T) -EQUAL['d'.ord] = (BIT_A|BIT_G|BIT_T) -EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T) -EQUAL['h'.ord] = (BIT_A|BIT_C|BIT_T) -EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G) -EQUAL['v'.ord] = (BIT_A|BIT_C|BIT_G) -EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T) -EQUAL['n'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T) - -# Module containing code to locate nucleotide patterns in sequences allowing for -# ambiguity codes and a given maximum edit distance. -# Insertions are nucleotides found in the pattern but not in the sequence. -# Deletions are nucleotides found in the sequence but not in the pattern. -# -# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197): -# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf -module PatternMatcher - # ------------------------------------------------------------------------------ - # str.match(pattern[, pos[, max_edit_distance]]) - # -> Match or nil - # - # ------------------------------------------------------------------------------ - # Method to locate the next pattern match starting from a given position. A match - # is allowed to contain a given maximum edit distance. If a match is located a - # Match object will be returned otherwise nil. - def match(pattern, pos = 0, max_edit_distance = 0) - vector = Vector.new(@seq, pattern, max_edit_distance) - - while pos < @seq.length - vector.update(pos) - - return vector.to_match(pos) if vector.match_found? - - pos += 1 - end - - nil # no match - end +require 'inline' +module PatternMatcher # ------------------------------------------------------------------------------ # str.scan(pattern[, pos[, max_edit_distance]]) # -> Array @@ -110,155 +41,216 @@ module PatternMatcher def scan(pattern, pos = 0, max_edit_distance = 0) matches = [] - while match = match(pattern, pos, max_edit_distance) + while result = match_C(self.seq, self.length, pattern, pattern.length, pos, max_edit_distance) + match = Match.new(*result, self.seq[result[0] ... result[0] + result[1]]); + if block_given? yield match else matches << match end - pos = match.pos + 1 + pos = match.beg + 1 end return matches unless block_given? end -end - -# Class containing the score vector used for locating matches. -class Vector - # Method to initailize the score vector. - def initialize(seq, pattern, max_edit_distance) - @seq = seq - @pattern = pattern - @max_edit_distance = max_edit_distance - @vector = [] - - (0 ... @pattern.length + 1).each do |i| - @vector[i] = Score.new(0, 0, i, 0, i) - end - end - - # Method to update the score vector. - def update(pos) - score_diag = @vector[0] - score_up = Score.new # insertion - score_left = @vector[1] # deletion - - (0 ... @pattern.length).each do |i| - if match?(@seq[pos], @pattern[i]) - new_score = score_diag.dup - new_score.matches += 1 - else - if deletion?(score_diag, score_up, score_left) - new_score = score_left.dup - new_score.deletions += 1 - elsif mismatch?(score_diag, score_up, score_left) - new_score = score_diag.dup - new_score.mismatches += 1 - elsif insertion?(score_diag, score_up, score_left) - new_score = score_up.dup - new_score.insertions += 1 - end - - new_score.edit_distance += 1 - end - - score_diag = @vector[i + 1] - score_up = new_score - score_left = @vector[i + 2] - - @vector[i + 1] = new_score - end - end - - # Method that determines if a match was found by analyzing the score vector. - def match_found? - if @vector.last.edit_distance <= @max_edit_distance - true - end - end - - # Method that returns a Match object initialized with - # information from the score vector. - def to_match(pos) - matches = @vector.last.matches - mismatches = @vector.last.mismatches - insertions = @vector.last.insertions - deletions = @vector.last.deletions - length = @pattern.length - insertions + deletions - offset = pos - length + 1 - match = @seq[offset ... offset + length] - - Match.new(offset, match, matches, mismatches, insertions, deletions, length) - end - - # Method to convert the score vector to a string. - def to_s - "(m,m,i,d,e)\n" + @vector.join("\n") + "\n\n" - end - - private - - # Method to determine if a match occurred. - def match?(char1, char2) - (EQUAL[char1.ord] & EQUAL[char2.ord]) != 0 - end - # Method to determine if a mismatch occured. - def mismatch?(score_diag, score_up, score_left) - if score_diag.edit_distance <= score_up.edit_distance and - score_diag.edit_distance <= score_left.edit_distance - true - end + inline do |builder| + # Macro for matching nucleotides including ambiguity codes. + builder.prefix %{ + #define MAX_PAT 1024 + } + + builder.prefix %{ + #define MATCH(A,B) ((bitmap[A] & bitmap[B]) != 0) + } + + builder.prefix %{ + typedef struct + { + unsigned int mis; + unsigned int ins; + unsigned int del; + unsigned int ed; + } score; + } + + # Bitmap for matching nucleotides including ambiguity codes. + # For each value bits are set from the left: bit pos 1 for A, + # bit pos 2 for T, bit pos 3 for C, and bit pos 4 for G. + builder.prefix %{ + char bitmap[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0, + 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0, + 0, 1,14, 4,11, 0, 0, 8, 7, 0, 0,10, 0, 5,15, 0, + 0, 0, 9,12, 2, 2,13, 3, 0, 6, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + } + + builder.prefix %{ + void vector_init(score *vec, unsigned int vec_len) + { + unsigned int i = 0; + + for (i = 1; i < vec_len; i++) + { + vec[i].ins = i; + vec[i].ed = i; + } + } + } + + builder.prefix %{ + void vector_print(score *vec, unsigned int vec_len) + { + unsigned int i = 0; + + for (i = 0; i < vec_len; i++) + { + printf("i: %d mis: %d ins: %d del: %d ed: %d\\n", i, vec[i].mis, vec[i].ins, vec[i].del, vec[i].ed); + } + + printf("---\\n"); + } + } + + builder.prefix %{ + int match_found(score *vec, unsigned int pat_len, unsigned int max_ed) + { + return (vec[pat_len].ed <= max_ed); + } + } + + builder.prefix %{ + void vector_update(score *vec, char *seq, char *pat, unsigned int pat_len, unsigned int pos) + { + score diag = vec[0]; + score up = {0, 0, 0, 0}; // insertion + score left = vec[1]; // deletion + score new = {0, 0, 0, 0}; + + unsigned int i = 0; + + for (i = 0; i < pat_len; i++) + { + if (MATCH(seq[pos], pat[i])) // match + { + new = diag; + } + else + { + if (left.ed <= diag.ed && left.ed <= up.ed) // deletion + { + new = left; + new.del++; + } + else if (diag.ed <= up.ed && diag.ed <= left.ed) // mismatch + { + new = diag; + new.mis++; + } + else if (up.ed <= diag.ed && up.ed <= left.ed) // insertion + { + new = up; + new.ins++; + } + else + { + printf("This should not happen\\n"); + exit(1); + } + + new.ed++; + } + + diag = vec[i + 1]; + up = new; + left = vec[i + 2]; + + vec[i + 1] = new; + } + } + } + + builder.c %{ + VALUE match_C( + VALUE _seq, // Sequence + VALUE _seq_len, // Sequence length + VALUE _pat, // Pattern + VALUE _pat_len, // Pattern length + VALUE _pos, // Offset position + VALUE _max_ed // Maximum edit distance + ) + { + char *seq = (char *) StringValuePtr(_seq); + char *pat = (char *) StringValuePtr(_pat); + unsigned int seq_len = FIX2UINT(_seq_len); + unsigned int pat_len = FIX2UINT(_pat_len); + unsigned int pos = FIX2UINT(_pos); + unsigned int max_ed = FIX2UINT(_max_ed); + + score vec[MAX_PAT] = {0}; + unsigned int vec_len = pat_len + 1; + unsigned int match_beg = 0; + unsigned int match_len = 0; + + VALUE match_ary; + + vector_init(vec, vec_len); + + while (pos < seq_len) + { + vector_update(vec, seq, pat, pat_len, pos); + + if (match_found(vec, pat_len, max_ed)) + { + match_len = pat_len - vec[pat_len].ins + vec[pat_len].del; + match_beg = pos - match_len + 1; + + match_ary = rb_ary_new(); + rb_ary_push(match_ary, INT2FIX(match_beg)); + rb_ary_push(match_ary, INT2FIX(match_len)); + rb_ary_push(match_ary, INT2FIX(vec[pat_len].mis)); + rb_ary_push(match_ary, INT2FIX(vec[pat_len].ins)); + rb_ary_push(match_ary, INT2FIX(vec[pat_len].del)); + + return match_ary; + } + + pos++; + } + + return Qfalse; // no match + } + } end - # Method to determine if an insertion occured. - def insertion?(score_diag, score_up, score_left) - if score_up.edit_distance <= score_diag.edit_distance and - score_up.edit_distance <= score_left.edit_distance - true - end - end + class Match + attr_accessor :beg, :length, :mis, :ins, :del, :match - # Method to determine if a deletion occured. - def deletion?(score_diag, score_up, score_left) - if score_left.edit_distance <= score_diag.edit_distance and - score_left.edit_distance <= score_up.edit_distance - true + def initialize(beg, length, mis, ins, del, match) + @beg = beg + @length = length + @mis = mis + @ins = ins + @del = del + @match = match end end end -# Class to instantiate Score objects that holds score information. -class Score - attr_accessor :matches, :mismatches, :insertions, :deletions, :edit_distance - - def initialize(matches = 0, mismatches = 0, insertions = 0, deletions = 0, edit_distance = 0) - @matches = matches - @mismatches = mismatches - @insertions = insertions - @deletions = deletions - @edit_distance = edit_distance - end - def to_s - "(#{[self.matches, self.mismatches, self.insertions, self.deletions, self.edit_distance].join(',')})" - end -end - -# Class for creating Match objects which contain the description of a -# match between a nucleotide sequence and a pattern. -class Match - attr_reader :pos, :match, :matches, :mismatches, :insertions, :deletions, :edit_distance, :length - - def initialize(pos, match, matches, mismatches, insertions, deletions, length) - @pos = pos - @match = match - @matches = matches - @mismatches = mismatches - @insertions = insertions - @deletions = deletions - @edit_distance = mismatches + insertions + deletions - @length = length - end -end +__END__ -- 2.39.5