From: martinahansen Date: Fri, 15 Apr 2011 12:38:49 +0000 (+0000) Subject: pattern hell X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=e982994fb2b3fc1ec67844bbfb06c643e9793417;p=biopieces.git pattern hell git-svn-id: http://biopieces.googlecode.com/svn/trunk@1328 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/Maasha/lib/filesys.rb b/code_ruby/Maasha/lib/filesys.rb index 7e0deaa..9024a58 100644 --- a/code_ruby/Maasha/lib/filesys.rb +++ b/code_ruby/Maasha/lib/filesys.rb @@ -30,6 +30,16 @@ class FilesysError < StandardError; end class Filesys include Enumerable + # Class method that returns a path to a unique temporary file. + # If no directory is specified reverts to the systems tmp directory. + def self.tmpfile(tmp_dir = ENV["TMPDIR"]) + time = Time.now.to_i + user = ENV["USER"] + pid = $$ + path = tmp_dir + [user, time + pid, pid].join("_") + ".tmp" + path + end + # Class method allowing open to be used on (zipped) files. # See File.open. def self.open(*args) diff --git a/code_ruby/Maasha/lib/patscan.rb b/code_ruby/Maasha/lib/patscan.rb new file mode 100644 index 0000000..e0ae706 --- /dev/null +++ b/code_ruby/Maasha/lib/patscan.rb @@ -0,0 +1,190 @@ +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'filesys' +require 'open3' + +# Module containing code to locate nucleotide patterns in sequences allowing for +# ambiguity codes and a given maximum mismatches, insertions, and deletions. The +# pattern match engine is based on scan_for_matches. +module Patscan + # ------------------------------------------------------------------------------ + # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) + # -> Array + # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) { |match| + # block + # } + # -> Match + # + # ------------------------------------------------------------------------------ + # Method to iterate through a sequence to locate pattern matches starting + # allowing for a maximum number of mismatches, insertions, and deletions. + # Matches found in block context return the Match object. Otherwise matches are + # returned in an Array. + def scan(pattern, max_mismatches = 0, max_insertions = 0, max_deletions = 0) + @pattern = pattern + "[#{max_mismatches},#{max_insertions},#{max_deletions}]" + matches = [] + + pattern_file = Filesys.tmpfile + + File.open(pattern_file, mode = 'w') do |ios| + ios.puts @pattern + end + + cmd = "scan_for_matches #{pattern_file}" + + Open3.popen3(cmd) do |stdin, stdout, stderr| + #raise SeqError, "scan_for_matches failed: #{stderr.gets}" unless stderr.empty? + + stdin << ">dummy\n#{self.seq}\n" + + stdin.close + + while block = stdout.gets($/ + ">") + if block =~ />dummy:\[(\d+),(\d+)\]\n(.+)/ + start = $1.to_i + stop = $2.to_i + match = $3.delete(" ") + length = stop - start + + if block_given? + yield Match.new(start, length, match) + else + matches << Match.new(start, length, match) + end + end + end + end + + matches unless block_given? + end + + class Match + attr_reader :pos, :length, :match + + def initialize(pos, length, match) + @pos = pos + @length = length + @match = match + end + end +end + +__END__ + + +# Eeeeiiiii - this one is slower than scan_for_matches! + + +# Module containing code to locate nucleotide patterns in sequences allowing for +# ambiguity codes and a given maximum edit distance. Pattern match engine is based +# on nrgrep. +module Patscan + # ------------------------------------------------------------------------------ + # str.scan(pattern[, max_edit_distance]) + # -> Array + # str.scan(pattern[, max_edit_distance]) { |match| + # block + # } + # -> Match + # + # ------------------------------------------------------------------------------ + # Method to iterate through a sequence to locate pattern matches starting + # from a given position and allowing for a maximum edit distance. + # Matches found in block context return the Match object. Otherwise matches are + # returned in an Array. + def scan(pattern, edit_distance = 0) + @pattern = pattern.upcase + matches = [] + + pattern_disambiguate + + cmd = "nrgrep_coords" + cmd << " -i" + cmd << " -k #{edit_distance}s" if edit_distance > 0 + cmd << " #{@pattern}" + + Open3.popen3(cmd) do |stdin, stdout, stderr| + stdin << self.seq + + stdin.close + + stdout.each_line do |line| + if line =~ /\[(\d+), (\d+)\]: (.+)/ + start = $1.to_i + stop = $2.to_i + match = $3 + length = stop - start + + if block_given? + yield Match.new(start, length, match) + else + matches << Match.new(start, length, match) + end + end + end + end + + matches unless block_given? + end + + private + + def pattern_disambiguate + case self.type + when 'protein' + @pattern.gsub!('J', '[IFVLWMAGCY]') + @pattern.gsub!('O', '[TSHEDQNKR]') + @pattern.gsub!('B', '[DN]') + @pattern.gsub!('Z', '[EQ]') + @pattern.gsub!('X', '.') + when 'dna' + @pattern.gsub!('R', '[AG]') + @pattern.gsub!('Y', '[CT]') + @pattern.gsub!('S', '[GC]') + @pattern.gsub!('W', '[AT]') + @pattern.gsub!('M', '[AC]') + @pattern.gsub!('K', '[GT]') + @pattern.gsub!('V', '[ACG]') + @pattern.gsub!('H', '[ACT]') + @pattern.gsub!('D', '[AGT]') + @pattern.gsub!('B', '[CGT]') + @pattern.gsub!('N', '.') + when 'rna' + else + raise SeqError "unknown sequence type: #{self.type}" + end + end + + class Match + attr_reader :pos, :length, :match + + def initialize(pos, length, match) + @pos = pos + @length = length + @match = match + end + end +end + diff --git a/code_ruby/Maasha/lib/seq.rb b/code_ruby/Maasha/lib/seq.rb index 64dc0db..1227ac4 100644 --- a/code_ruby/Maasha/lib/seq.rb +++ b/code_ruby/Maasha/lib/seq.rb @@ -23,6 +23,7 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'digest' +#require 'patscan' require 'patternmatcher' # Residue alphabets @@ -39,6 +40,7 @@ SCORE_ILLUMINA = 64 class SeqError < StandardError; end class Seq + #include Patscan include PatternMatcher attr_accessor :seq_name, :seq, :type, :qual @@ -310,40 +312,6 @@ class Seq ((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2) end - # Method that finds an adaptor or part thereof in the sequence of a Seq object. - # Returns a Match object if the adaptor was found otherwise nil. The ed_percent - # indicates the maximum edit distance allowed in all possible overlaps. - def adaptor_find(adaptor, ed_percent = 0) - raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent - - pos = 0 - - while adaptor.length > 0 - ed_max = (adaptor.length * ed_percent * 0.01).round - - match = self.match(adaptor, pos, ed_max) - - return match unless match.nil? - - adaptor = adaptor[0 ... -1] - - pos = self.len - adaptor.length - end - end - - # Method that locates an adaptor or part thereof in the sequence - # of a Seq object beginning from the right and removes the adaptor - # sequence if found. The hd_percent is used to calculate the - # maximum hamming distance allowed for all possible overlaps. - def adaptor_clip_right(adaptor, hd_percent = 0) - pos = self.adaptor_locate_right(adaptor, hd_percent) - - if pos > 0 - self.seq = self.seq[0 ... pos] - self.qual = self.qual[0 ... pos] unless self.qual.nil? - end - end - # Method to convert the quality scores from a specified base # to another base. def convert_phred2illumina! diff --git a/code_ruby/Maasha/test/test_seq.rb b/code_ruby/Maasha/test/test_seq.rb index 2a00c90..e463e6f 100755 --- a/code_ruby/Maasha/test/test_seq.rb +++ b/code_ruby/Maasha/test/test_seq.rb @@ -331,50 +331,6 @@ class TestSeq < Test::Unit::TestCase @entry.seq = "--AAAa" assert_equal(25.00, @entry.soft_mask) end - - def test_Seq_adaptor_find_with_bad_ed_percent_raises - @entry.seq = "actagctagctacgtacg" - assert_raise(SeqError) { @entry.adaptor_find("tacg", ed_percent = -1) } - assert_raise(SeqError) { @entry.adaptor_find("tacg", ed_percent = 101) } - end - - def test_Seq_adaptor_find_with_ok_ed_percent_dont_raise - @entry.seq = "actagctagctacgtacg" - assert_nothing_raised { @entry.adaptor_find("tacg", ed_percent = 0) } - assert_nothing_raised { @entry.adaptor_find("tacg", ed_percent = 100) } - end - - def test_Seq_adaptor_find_with_no_match_returns_nil - @entry.seq = "actaaggctagctacgtccg" - assert_nil(@entry.adaptor_find("TTTT")) - end - - def test_Seq_adaptor_find_returns_correct_match - @entry.seq = "actaaggctagctacgtccg" - assert_equal(0, @entry.adaptor_find("actaa").pos) - assert_equal(7, @entry.adaptor_find("ctagc").pos) - assert_equal(15, @entry.adaptor_find("gtccg").pos) - assert_equal(17, @entry.adaptor_find("ccgTT").pos) - assert_equal(18, @entry.adaptor_find("cgTTT").pos) - assert_equal(19, @entry.adaptor_find("gTTTT").pos) - end - - def test_Seq_adaptor_with_mis_and_ed_percent_returns_correct_match - @entry.seq = "actaaggctagctacgtccg" - assert_equal(0, @entry.adaptor_find("GGGaag", ed_percent = 50).pos) - assert_equal(14, @entry.adaptor_find("cgtcTTTT", ed_percent = 50).pos) - end - - def test_Seq_adaptor_with_ins_and_ed_percent_returns_correct_match - @entry.seq = "actaaggctagctacgtccg" - assert_equal(0, @entry.adaptor_find("actGGGaag", ed_percent = 50).pos) - assert_equal(15, @entry.adaptor_find("gtAccgTTTTT", ed_percent = 10).pos) - end - - def test_Seq_adaptor_with_del_and_ed_percent_returns_correct_match - @entry.seq = "actaaggctagctacgtccg" - assert_equal(0, @entry.adaptor_find("actctag", ed_percent = 50).pos) - end end