From 04ea707c99bdd046adc73f34e60aeb006eabe45a Mon Sep 17 00:00:00 2001 From: martinahansen Date: Sat, 19 Mar 2011 14:44:52 +0000 Subject: [PATCH] split digest.rb from seq.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1299 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/Maasha/lib/digest.rb | 108 +++++++++++++++++++++++++++++++++ code_ruby/Maasha/lib/seq.rb | 84 +------------------------ 2 files changed, 109 insertions(+), 83 deletions(-) create mode 100644 code_ruby/Maasha/lib/digest.rb diff --git a/code_ruby/Maasha/lib/digest.rb b/code_ruby/Maasha/lib/digest.rb new file mode 100644 index 0000000..3268a9d --- /dev/null +++ b/code_ruby/Maasha/lib/digest.rb @@ -0,0 +1,108 @@ +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'seq' + +# Error class for all exceptions to do with Digest. +class DigestError < StandardError; end + +class Digest + include Enumerable + + # Initialize a digest object with the following arguments: + # - seq: A sequence object. + # - pattern: A restriction enzyme recognition pattern. + # - cut_pos: Offset from match position where the enzyme cuts. + def initialize(seq, pattern, cut_pos) + @seq = seq + @pattern = disambiguate(pattern) + @cut_pos = cut_pos + @offset = 0 + end + + # Method to get the next digestion product from a sequence. + def each + @seq.seq.upcase.scan @pattern do + pos = $`.length + @cut_pos - 1 + + if pos >= 0 and pos < @seq.length - 2 + seq = Seq.new("#{@seq.seq_name}[#{@offset}-#{pos}]", @seq.seq[@offset .. pos], @seq.type) + + yield seq + end + + @offset = pos + 1 + end + + if @offset < 0 + @offset = 0 + elsif @offset > @seq.length + @offset = 0 + end + + seq = Seq.new("#{@seq.seq_name}[#{@offset}-#{@seq.length - 1}]", @seq.seq[@offset .. @seq.length], @seq.type) + + yield seq + + self # conventionally + end + + private + + # Method that returns a regexp object with a restriction + # enzyme pattern with ambiguity codes substituted to the + # appropriate regexp. + def disambiguate(pattern) + ambiguity = { + 'A' => "A", + 'T' => "T", + 'U' => "T", + 'C' => "C", + 'G' => "G", + 'M' => "[AC]", + 'R' => "[AG]", + 'W' => "[AT]", + 'S' => "[CG]", + 'Y' => "[CT]", + 'K' => "[GT]", + 'V' => "[ACG]", + 'H' => "[ACT]", + 'D' => "[AGT]", + 'B' => "[CGT]", + 'N' => "[GATC]" + } + + new_pattern = "" + + pattern.upcase.each_char do |char| + if ambiguity.has_key? char + new_pattern << ambiguity[char] + else + raise DigestError, "Could not disambiguate residue: #{char}" + end + end + + Regexp.new(new_pattern) + end +end diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index a63741f..bf07ab3 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -23,6 +23,7 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'amatch' +require 'digest' # Residue alphabets DNA = %w[a t c g] @@ -418,89 +419,6 @@ class Seq end end -# Error class for all exceptions to do with Digest. -class DigestError < StandardError; end - -class Digest - include Enumerable - - # Initialize a digest object with the following arguments: - # - seq: A sequence object. - # - pattern: A restriction enzyme recognition pattern. - # - cut_pos: Offset from match position where the enzyme cuts. - def initialize(seq, pattern, cut_pos) - @seq = seq - @pattern = disambiguate(pattern) - @cut_pos = cut_pos - @offset = 0 - end - - # Method to get the next digestion product from a sequence. - def each - @seq.seq.upcase.scan @pattern do - pos = $`.length + @cut_pos - 1 - - if pos >= 0 and pos < @seq.length - 2 - seq = Seq.new("#{@seq.seq_name}[#{@offset}-#{pos}]", @seq.seq[@offset .. pos], @seq.type) - - yield seq - end - - @offset = pos + 1 - end - - if @offset < 0 - @offset = 0 - elsif @offset > @seq.length - @offset = 0 - end - - seq = Seq.new("#{@seq.seq_name}[#{@offset}-#{@seq.length - 1}]", @seq.seq[@offset .. @seq.length], @seq.type) - - yield seq - - self # conventionally - end - - private - - # Method that returns a regexp object with a restriction - # enzyme pattern with ambiguity codes substituted to the - # appropriate regexp. - def disambiguate(pattern) - ambiguity = { - 'A' => "A", - 'T' => "T", - 'U' => "T", - 'C' => "C", - 'G' => "G", - 'M' => "[AC]", - 'R' => "[AG]", - 'W' => "[AT]", - 'S' => "[CG]", - 'Y' => "[CT]", - 'K' => "[GT]", - 'V' => "[ACG]", - 'H' => "[ACT]", - 'D' => "[AGT]", - 'B' => "[CGT]", - 'N' => "[GATC]" - } - - new_pattern = "" - - pattern.upcase.each_char do |char| - if ambiguity.has_key? char - new_pattern << ambiguity[char] - else - raise DigestError, "Could not disambiguate residue: #{char}" - end - end - - Regexp.new(new_pattern) - end -end - __END__ -- 2.39.5