--- /dev/null
+# Copyright (C) 2007-2011 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'filesys'
+require 'open3'
+
+# Module containing code to locate nucleotide patterns in sequences allowing for
+# ambiguity codes and a given maximum mismatches, insertions, and deletions. The
+# pattern match engine is based on scan_for_matches.
+module Patscan
+ # ------------------------------------------------------------------------------
+ # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]])
+ # -> Array
+ # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) { |match|
+ # block
+ # }
+ # -> Match
+ #
+ # ------------------------------------------------------------------------------
+ # Method to iterate through a sequence to locate pattern matches starting
+ # allowing for a maximum number of mismatches, insertions, and deletions.
+ # Matches found in block context return the Match object. Otherwise matches are
+ # returned in an Array.
+ def scan(pattern, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
+ @pattern = pattern + "[#{max_mismatches},#{max_insertions},#{max_deletions}]"
+ matches = []
+
+ pattern_file = Filesys.tmpfile
+
+ File.open(pattern_file, mode = 'w') do |ios|
+ ios.puts @pattern
+ end
+
+ cmd = "scan_for_matches #{pattern_file}"
+
+ Open3.popen3(cmd) do |stdin, stdout, stderr|
+ #raise SeqError, "scan_for_matches failed: #{stderr.gets}" unless stderr.empty?
+
+ stdin << ">dummy\n#{self.seq}\n"
+
+ stdin.close
+
+ while block = stdout.gets($/ + ">")
+ if block =~ />dummy:\[(\d+),(\d+)\]\n(.+)/
+ start = $1.to_i
+ stop = $2.to_i
+ match = $3.delete(" ")
+ length = stop - start
+
+ if block_given?
+ yield Match.new(start, length, match)
+ else
+ matches << Match.new(start, length, match)
+ end
+ end
+ end
+ end
+
+ matches unless block_given?
+ end
+
+ class Match
+ attr_reader :pos, :length, :match
+
+ def initialize(pos, length, match)
+ @pos = pos
+ @length = length
+ @match = match
+ end
+ end
+end
+
+__END__
+
+
+# Eeeeiiiii - this one is slower than scan_for_matches!
+
+
+# Module containing code to locate nucleotide patterns in sequences allowing for
+# ambiguity codes and a given maximum edit distance. Pattern match engine is based
+# on nrgrep.
+module Patscan
+ # ------------------------------------------------------------------------------
+ # str.scan(pattern[, max_edit_distance])
+ # -> Array
+ # str.scan(pattern[, max_edit_distance]) { |match|
+ # block
+ # }
+ # -> Match
+ #
+ # ------------------------------------------------------------------------------
+ # Method to iterate through a sequence to locate pattern matches starting
+ # from a given position and allowing for a maximum edit distance.
+ # Matches found in block context return the Match object. Otherwise matches are
+ # returned in an Array.
+ def scan(pattern, edit_distance = 0)
+ @pattern = pattern.upcase
+ matches = []
+
+ pattern_disambiguate
+
+ cmd = "nrgrep_coords"
+ cmd << " -i"
+ cmd << " -k #{edit_distance}s" if edit_distance > 0
+ cmd << " #{@pattern}"
+
+ Open3.popen3(cmd) do |stdin, stdout, stderr|
+ stdin << self.seq
+
+ stdin.close
+
+ stdout.each_line do |line|
+ if line =~ /\[(\d+), (\d+)\]: (.+)/
+ start = $1.to_i
+ stop = $2.to_i
+ match = $3
+ length = stop - start
+
+ if block_given?
+ yield Match.new(start, length, match)
+ else
+ matches << Match.new(start, length, match)
+ end
+ end
+ end
+ end
+
+ matches unless block_given?
+ end
+
+ private
+
+ def pattern_disambiguate
+ case self.type
+ when 'protein'
+ @pattern.gsub!('J', '[IFVLWMAGCY]')
+ @pattern.gsub!('O', '[TSHEDQNKR]')
+ @pattern.gsub!('B', '[DN]')
+ @pattern.gsub!('Z', '[EQ]')
+ @pattern.gsub!('X', '.')
+ when 'dna'
+ @pattern.gsub!('R', '[AG]')
+ @pattern.gsub!('Y', '[CT]')
+ @pattern.gsub!('S', '[GC]')
+ @pattern.gsub!('W', '[AT]')
+ @pattern.gsub!('M', '[AC]')
+ @pattern.gsub!('K', '[GT]')
+ @pattern.gsub!('V', '[ACG]')
+ @pattern.gsub!('H', '[ACT]')
+ @pattern.gsub!('D', '[AGT]')
+ @pattern.gsub!('B', '[CGT]')
+ @pattern.gsub!('N', '.')
+ when 'rna'
+ else
+ raise SeqError "unknown sequence type: #{self.type}"
+ end
+ end
+
+ class Match
+ attr_reader :pos, :length, :match
+
+ def initialize(pos, length, match)
+ @pos = pos
+ @length = length
+ @match = match
+ end
+ end
+end
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
require 'digest'
+#require 'patscan'
require 'patternmatcher'
# Residue alphabets
class SeqError < StandardError; end
class Seq
+ #include Patscan
include PatternMatcher
attr_accessor :seq_name, :seq, :type, :qual
((self.seq.scan(/[a-z]/).size.to_f / (self.len - self.indels).to_f) * 100).round(2)
end
- # Method that finds an adaptor or part thereof in the sequence of a Seq object.
- # Returns a Match object if the adaptor was found otherwise nil. The ed_percent
- # indicates the maximum edit distance allowed in all possible overlaps.
- def adaptor_find(adaptor, ed_percent = 0)
- raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent
-
- pos = 0
-
- while adaptor.length > 0
- ed_max = (adaptor.length * ed_percent * 0.01).round
-
- match = self.match(adaptor, pos, ed_max)
-
- return match unless match.nil?
-
- adaptor = adaptor[0 ... -1]
-
- pos = self.len - adaptor.length
- end
- end
-
- # Method that locates an adaptor or part thereof in the sequence
- # of a Seq object beginning from the right and removes the adaptor
- # sequence if found. The hd_percent is used to calculate the
- # maximum hamming distance allowed for all possible overlaps.
- def adaptor_clip_right(adaptor, hd_percent = 0)
- pos = self.adaptor_locate_right(adaptor, hd_percent)
-
- if pos > 0
- self.seq = self.seq[0 ... pos]
- self.qual = self.qual[0 ... pos] unless self.qual.nil?
- end
- end
-
# Method to convert the quality scores from a specified base
# to another base.
def convert_phred2illumina!
@entry.seq = "--AAAa"
assert_equal(25.00, @entry.soft_mask)
end
-
- def test_Seq_adaptor_find_with_bad_ed_percent_raises
- @entry.seq = "actagctagctacgtacg"
- assert_raise(SeqError) { @entry.adaptor_find("tacg", ed_percent = -1) }
- assert_raise(SeqError) { @entry.adaptor_find("tacg", ed_percent = 101) }
- end
-
- def test_Seq_adaptor_find_with_ok_ed_percent_dont_raise
- @entry.seq = "actagctagctacgtacg"
- assert_nothing_raised { @entry.adaptor_find("tacg", ed_percent = 0) }
- assert_nothing_raised { @entry.adaptor_find("tacg", ed_percent = 100) }
- end
-
- def test_Seq_adaptor_find_with_no_match_returns_nil
- @entry.seq = "actaaggctagctacgtccg"
- assert_nil(@entry.adaptor_find("TTTT"))
- end
-
- def test_Seq_adaptor_find_returns_correct_match
- @entry.seq = "actaaggctagctacgtccg"
- assert_equal(0, @entry.adaptor_find("actaa").pos)
- assert_equal(7, @entry.adaptor_find("ctagc").pos)
- assert_equal(15, @entry.adaptor_find("gtccg").pos)
- assert_equal(17, @entry.adaptor_find("ccgTT").pos)
- assert_equal(18, @entry.adaptor_find("cgTTT").pos)
- assert_equal(19, @entry.adaptor_find("gTTTT").pos)
- end
-
- def test_Seq_adaptor_with_mis_and_ed_percent_returns_correct_match
- @entry.seq = "actaaggctagctacgtccg"
- assert_equal(0, @entry.adaptor_find("GGGaag", ed_percent = 50).pos)
- assert_equal(14, @entry.adaptor_find("cgtcTTTT", ed_percent = 50).pos)
- end
-
- def test_Seq_adaptor_with_ins_and_ed_percent_returns_correct_match
- @entry.seq = "actaaggctagctacgtccg"
- assert_equal(0, @entry.adaptor_find("actGGGaag", ed_percent = 50).pos)
- assert_equal(15, @entry.adaptor_find("gtAccgTTTTT", ed_percent = 10).pos)
- end
-
- def test_Seq_adaptor_with_del_and_ed_percent_returns_correct_match
- @entry.seq = "actaaggctagctacgtccg"
- assert_equal(0, @entry.adaptor_find("actctag", ed_percent = 50).pos)
- end
end