3 # Copyright (C) 2007-2012 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
21 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
23 # This program is part of the Biopieces framework (www.biopieces.org).
25 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
27 # Remove adaptors or parts thereof from sequences in the stream.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 require 'maasha/biopieces'
34 require 'maasha/fasta'
36 # Error class for PatScan errors.
37 class PatScanError < StandardError; end;
40 def initialize(tmpdir, cpus)
43 @files_fasta = Dir.glob(File.join(@tmpdir, "*.fna"))
47 file_pat = File.join(@tmpdir, "pat.txt")
49 File.open(file_pat, 'w') do |ios|
57 @files_fasta.each do |file_fasta|
58 Thread.pass while child_count >= @cpus
59 ch_mutex.synchronize { child_count += 1 }
61 threads << Thread.new do
62 command = command_compile(file_pat, file_fasta)
64 raise PatScanError, "Command failed: #{command}" unless $?.success?
65 ch_mutex.synchronize { child_count -= 1 }
69 threads.each { |t| t.join }
73 files_result = Dir.glob(File.join(@tmpdir, "*.out"))
77 files_result.each do |file|
78 Fasta.open(file, 'r') do |ios|
80 if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
84 matches[name] = [start, stop - start + 1] unless matches[name]
86 raise "Failed to parse sequence name: #{entry.seq_name}"
96 Dir.glob(File.join(@tmpdir, "*.out")).each do |file|
103 def command_compile(file_pat, file_fasta)
105 commands << "nice -n 19"
106 commands << "scan_for_matches"
108 commands << "< #{file_fasta}"
109 commands << "> #{file_fasta}.out"
110 command = commands.join(" ")
114 # Error class for Pattern errors.
115 class PatternError < StandardError; end;
118 def self.create_left(adaptor, misp, insp, delp, len)
121 (adaptor.length).downto(len) do |i|
122 pattern = adaptor[adaptor.length - i ... adaptor.length]
123 mis = (pattern.length * misp * 0.01).round
124 ins = (pattern.length * insp * 0.01).round
125 del = (pattern.length * delp * 0.01).round
127 if i == adaptor.length
128 patterns << "#{pattern}[#{mis},#{ins},#{del}]"
130 patterns << "^ #{pattern}[#{mis},#{ins},#{del}]"
137 def self.create_right(adaptor, misp, insp, delp, len)
140 (adaptor.length).downto(len) do |i|
141 pattern = adaptor[0 ... i]
142 mis = (pattern.length * misp * 0.01).round
143 ins = (pattern.length * insp * 0.01).round
144 del = (pattern.length * delp * 0.01).round
146 if i == adaptor.length
147 patterns << "#{pattern}[#{mis},#{ins},#{del}]"
149 patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
158 def self.compile(patterns)
161 while patterns.size > 1
162 new_patterns = patterns[0 ... -2]
163 new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )"
165 patterns = new_patterns
173 casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
174 casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
175 casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
176 casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
177 casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
178 casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
179 casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
180 casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
181 casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
182 casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'}
183 casts << {:long=>'bases_per_file',:short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'}
185 options = Biopieces.options_parse(ARGV, casts)
187 if options[:forward_rc]
188 options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq
191 if options[:reverse_rc]
192 options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq
195 raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse]
198 options[:len_forward] = options[:forward].length unless options[:len_forward]
200 if options[:len_forward] > options[:forward].length
201 raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})"
206 options[:len_reverse] = options[:reverse].length unless options[:len_reverse]
208 if options[:len_reverse] > options[:reverse].length
209 raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
213 tmpdir = Biopieces.mktmpdir
214 file_records = File.join(tmpdir, "data.stream")
220 Biopieces.open(options[:stream_in], file_records) do |input, output|
221 file_fasta = File.join(tmpdir, "#{count_file}.fna")
222 out_fa = Fasta.open(file_fasta, 'w')
224 input.each do |record|
227 if record[:SEQ] and record[:SEQ].length > 0
228 record[:SEQ_NAME] = count_seq
230 seq = Seq.new_bp(record)
232 out_fa.puts seq.to_fasta
235 bases += record[:SEQ].length
237 if bases > options[:bases_per_file]
241 file_fasta = File.join(tmpdir, "#{count_file}.fna")
242 out_fa = Fasta.open(file_fasta, 'w')
247 out_fa.close if out_fa.respond_to? :close
250 patscan = PatScan.new(tmpdir, options[:cpus])
255 pat_left = Pattern.create_left(
257 options[:mismatches],
258 options[:insertions],
260 options[:len_forward])
262 patscan.run(pat_left)
263 matches_left = patscan.parse_results
268 pat_right = Pattern.create_right(
270 options[:mismatches],
271 options[:insertions],
273 options[:len_reverse])
275 patscan.run(pat_right)
276 matches_right = patscan.parse_results
281 Biopieces.open(file_records, options[:stream_out]) do |input, output|
282 input.each_record do |record|
284 if matches_left[count_seq]
285 record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first
286 record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last
287 record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]]
290 if matches_right[count_seq]
291 record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first
292 record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last
293 record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]]
304 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<