From 5f03def586723979d479b52dea50d433a643af33 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Sat, 16 Apr 2011 07:08:08 +0000 Subject: [PATCH] cleaned patternmatcher code git-svn-id: http://biopieces.googlecode.com/svn/trunk@1329 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/Maasha/lib/patternmatcher.rb | 61 ++++++++++++++------------ 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/code_ruby/Maasha/lib/patternmatcher.rb b/code_ruby/Maasha/lib/patternmatcher.rb index 478ffba..8dfc66e 100644 --- a/code_ruby/Maasha/lib/patternmatcher.rb +++ b/code_ruby/Maasha/lib/patternmatcher.rb @@ -22,28 +22,31 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# IUPAC nucleotide pair ambiguity equivalents -EQUAL = { - :AA => true, :BU => true, :TH => true, :UY => true, - :TT => true, :CB => true, :UH => true, :SC => true, - :CC => true, :GB => true, :VA => true, :SG => true, - :GG => true, :TB => true, :VC => true, :CS => true, - :UU => true, :UB => true, :VG => true, :GS => true, - :NA => true, :DA => true, :AV => true, :WA => true, - :NT => true, :DG => true, :CV => true, :WT => true, - :NC => true, :DT => true, :GV => true, :WU => true, - :NG => true, :DU => true, :KG => true, :AW => true, - :NU => true, :AD => true, :KT => true, :TW => true, - :AN => true, :GD => true, :KU => true, :UW => true, - :TN => true, :TD => true, :GK => true, :RA => true, - :CN => true, :UD => true, :TK => true, :RG => true, - :GN => true, :HA => true, :UK => true, :AR => true, - :UN => true, :HC => true, :YC => true, :GR => true, - :NN => true, :HT => true, :YT => true, :MA => true, - :BC => true, :HU => true, :YU => true, :MC => true, - :BG => true, :AH => true, :CY => true, :AM => true, - :BT => true, :CH => true, :TY => true, :CM => true, -} +# IUPAC nucleotide pair ambiguity equivalents are saved in an +# array of bit fields. + +BIT_A = 1 << 0 +BIT_T = 1 << 1 +BIT_C = 1 << 2 +BIT_G = 1 << 3 + +EQUAL = Array.new(256, 0) +EQUAL['A'.ord] = BIT_A +EQUAL['T'.ord] = BIT_T +EQUAL['U'.ord] = BIT_T +EQUAL['C'.ord] = BIT_C +EQUAL['G'.ord] = BIT_G +EQUAL['M'.ord] = (BIT_A|BIT_C) +EQUAL['R'.ord] = (BIT_A|BIT_G) +EQUAL['W'.ord] = (BIT_A|BIT_T) +EQUAL['S'.ord] = (BIT_C|BIT_G) +EQUAL['Y'.ord] = (BIT_C|BIT_T) +EQUAL['K'.ord] = (BIT_G|BIT_T) +EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T) +EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T) +EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T) +EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G) +EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T) # Module containing code to locate nucleotide patterns in sequences allowing for # ambiguity codes and a given maximum edit distance. @@ -91,16 +94,15 @@ module PatternMatcher # returned in an Array. def scan(pattern, pos = 0, max_edit_distance = 0) matches = [] - offset = pos - while match = match(pattern, offset, max_edit_distance) + while match = match(pattern, pos, max_edit_distance) if block_given? yield match else matches << match end - offset = match.pos + 1 + pos = match.pos + 1 end return matches unless block_given? @@ -124,7 +126,7 @@ module PatternMatcher new_vector = @vector.dup (0 ... @pattern.length).each do |i| - if EQUAL[(@seq[@pos] + @pattern[i]).upcase.to_sym] + if match?(@seq[@pos], @pattern[i]) new_vector[i + 1] = @vector[i].dup new_vector[i + 1].matches += 1 else @@ -141,8 +143,6 @@ module PatternMatcher elsif insertion?(mismatch, insertion, deletion) insertion.insertions += 1 new_vector[i + 1] = insertion - else - raise "AAAAarrgh" end end end @@ -150,6 +150,11 @@ module PatternMatcher @vector = new_vector end + # Method to determine if a match occurred. + def match?(char1, char2) + (EQUAL[char1.upcase.ord] & EQUAL[char2.upcase.ord]) != 0 + end + # Method to determine if a mismatch occured. def mismatch?(mismatch, insertion, deletion) if mismatch.edit_distance <= insertion.edit_distance and -- 2.39.5