1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'maasha/filesys'
28 # Module containing code to locate nucleotide patterns in sequences allowing for
29 # ambiguity codes and a given maximum mismatches, insertions, and deletions. The
30 # pattern match engine is based on scan_for_matches.
32 # ------------------------------------------------------------------------------
33 # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]])
35 # str.scan(pattern[, max_mismatches [, max_insertions [,max_deletions]]]) { |match|
40 # ------------------------------------------------------------------------------
41 # Method to iterate through a sequence to locate pattern matches starting
42 # allowing for a maximum number of mismatches, insertions, and deletions.
43 # Matches found in block context return the Match object. Otherwise matches are
44 # returned in an Array.
45 def scan(pattern, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
46 @pattern = pattern + "[#{max_mismatches},#{max_insertions},#{max_deletions}]"
49 pattern_file = Filesys.tmpfile
51 File.open(pattern_file, mode = 'w') do |ios|
55 cmd = "scan_for_matches #{pattern_file}"
57 Open3.popen3(cmd) do |stdin, stdout, stderr|
58 #raise SeqError, "scan_for_matches failed: #{stderr.gets}" unless stderr.empty?
60 stdin << ">dummy\n#{self.seq}\n"
64 while block = stdout.gets($/ + ">")
65 if block =~ />dummy:\[(\d+),(\d+)\]\n(.+)/
68 match = $3.delete(" ")
72 yield Match.new(start, length, match)
74 matches << Match.new(start, length, match)
80 matches unless block_given?
84 attr_reader :pos, :length, :match
86 def initialize(pos, length, match)
97 # Eeeeiiiii - this one is slower than scan_for_matches!
100 # Module containing code to locate nucleotide patterns in sequences allowing for
101 # ambiguity codes and a given maximum edit distance. Pattern match engine is based
104 # ------------------------------------------------------------------------------
105 # str.scan(pattern[, max_edit_distance])
107 # str.scan(pattern[, max_edit_distance]) { |match|
112 # ------------------------------------------------------------------------------
113 # Method to iterate through a sequence to locate pattern matches starting
114 # from a given position and allowing for a maximum edit distance.
115 # Matches found in block context return the Match object. Otherwise matches are
116 # returned in an Array.
117 def scan(pattern, edit_distance = 0)
118 @pattern = pattern.upcase
123 cmd = "nrgrep_coords"
125 cmd << " -k #{edit_distance}s" if edit_distance > 0
126 cmd << " #{@pattern}"
128 Open3.popen3(cmd) do |stdin, stdout, stderr|
133 stdout.each_line do |line|
134 if line =~ /\[(\d+), (\d+)\]: (.+)/
138 length = stop - start
141 yield Match.new(start, length, match)
143 matches << Match.new(start, length, match)
149 matches unless block_given?
154 def pattern_disambiguate
157 @pattern.gsub!('J', '[IFVLWMAGCY]')
158 @pattern.gsub!('O', '[TSHEDQNKR]')
159 @pattern.gsub!('B', '[DN]')
160 @pattern.gsub!('Z', '[EQ]')
161 @pattern.gsub!('X', '.')
163 @pattern.gsub!('R', '[AG]')
164 @pattern.gsub!('Y', '[CT]')
165 @pattern.gsub!('S', '[GC]')
166 @pattern.gsub!('W', '[AT]')
167 @pattern.gsub!('M', '[AC]')
168 @pattern.gsub!('K', '[GT]')
169 @pattern.gsub!('V', '[ACG]')
170 @pattern.gsub!('H', '[ACT]')
171 @pattern.gsub!('D', '[AGT]')
172 @pattern.gsub!('B', '[CGT]')
173 @pattern.gsub!('N', '.')
176 raise SeqError "unknown sequence type: #{self.type}"
181 attr_reader :pos, :length, :match
183 def initialize(pos, length, match)