3 # Copyright (C) 2007-2011 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
21 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
23 # This program is part of the Biopieces framework (www.biopieces.org).
25 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
27 # Remove adaptors or parts thereof from sequences in the stream.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 require 'maasha/biopieces'
34 require 'maasha/fasta'
36 # Error class for PatScan errors.
37 class PatScanError < StandardError; end;
40 def initialize(options, tmpdir, file_pattern, file_patscan, cpus)
42 @file_pattern = file_pattern
43 @file_patscan = file_patscan
45 @files_fasta = Dir.glob(File.join(tmpdir, "*.fna"))
47 pat = Pattern.new(@options)
48 pat.write(@file_pattern)
54 # @files_fasta.each do |file|
56 # Process.wait if ( child_count += 1 ) >= @cpus
58 # command = command_compile(file)
60 # raise PatScanError, "Command failed: #{command}" unless $?.success?
69 @files_fasta.each do |file|
70 Thread.pass while child_count >= @cpus
74 command = command_compile(file)
76 raise PatScanError, "Command failed: #{command}" unless $?.success?
84 # ch_mutex = Mutex.new
87 # @files_fasta.each do |file|
88 # Thread.pass while child_count >= @cpus
89 # ch_mutex.synchronize { child_count += 1 }
91 # threads << Thread.new do
92 # command = command_compile(file)
94 # raise PatScanError, "Command failed: #{command}" unless $?.success?
95 # ch_mutex.synchronize { child_count -= 1 }
99 # threads.each { |t| t.join }
102 def command_compile(file)
104 commands << "nice -n 19"
105 commands << "scan_for_matches"
106 commands << @file_pattern
107 commands << "< #{file}"
108 commands << "> #{file}.out"
109 command = commands.join(" ")
115 Fasta.open(@file_patscan, mode='r') do |ios|
117 if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
121 matches[name] = [start, stop - start + 1] unless matches.has_key? name
123 raise "Failed to parse sequence name: #{entry.seq_name}"
132 # Error class for Pattern errors.
133 class PatternError < StandardError; end;
136 def initialize(options)
139 @patterns << pattern_internal
140 @patterns += patterns_end if @options[:partial]
146 while @patterns.size > 1
147 new_patterns = @patterns[0 ... -2]
148 new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )"
150 @patterns = new_patterns
157 File.open(file, mode='w') do |ios|
165 pattern = @options[:adaptor]
166 mis = mis_count(pattern)
167 ins = ins_count(pattern)
168 del = del_count(pattern)
170 "#{pattern}[#{mis},#{ins},#{del}]"
175 adaptor = @options[:adaptor]
177 raise PatternError, "len > adaptor length: #{@options[:len]} > #{adaptor.length - 1}" if @options[:len] > adaptor.length - 1
179 (adaptor.length - 1).downto(@options[:len]) do |i|
180 pattern = adaptor[0 ... i]
181 mis = mis_count(pattern)
182 ins = ins_count(pattern)
183 del = del_count(pattern)
184 patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
190 def mis_count(pattern)
191 (pattern.length * @options[:mismatches] * 0.01).round
194 def ins_count(pattern)
195 (pattern.length * @options[:insertions] * 0.01).round
198 def del_count(pattern)
199 (pattern.length * @options[:deletions] * 0.01).round
204 casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil}
205 casts << {:long=>'partial', :short=>'p', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
206 casts << {:long=>'len', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'}
207 casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
208 casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
209 casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
210 casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'}
212 BASE_PER_FILE = 10_000_000
214 options = Biopieces.options_parse(ARGV, casts)
216 #tmpdir = Biopieces.mktmpdir
217 tmpdir = "Tyt" # DEBUG TODO
218 file_records = File.join(tmpdir, "data.stream")
219 file_pattern = File.join(tmpdir, "pattern.txt")
220 file_patscan = File.join(tmpdir, "patscan.out")
226 Biopieces.open(options[:stream_in], file_records) do |input, output|
227 file_fasta = File.join(tmpdir, "#{number_file}.fna")
228 out_fa = Fasta.open(file_fasta, mode='w')
230 input.each do |record|
233 if record.has_key? :SEQ
234 record[:SEQ_NAME] = number_seq
238 bases += record[:SEQ].length
240 if bases > BASE_PER_FILE
244 file_fasta = File.join(tmpdir, "#{number_file}.fna")
245 out_fa = Fasta.open(file_fasta, mode='w')
250 out_fa.close if out_fa.respond_to? :close
253 patscan = PatScan.new(options, tmpdir, file_pattern, file_patscan, options[:cpus])
256 matches = patscan.parse_results
260 Biopieces.open(file_records, options[:stream_out]) do |input, output|
261 input.each_record do |record|
262 if record.has_key? :SEQ
263 if matches.has_key? number_seq
264 record[:ADAPTOR_POS] = matches[number_seq].first
265 record[:ADAPTOR_LEN] = matches[number_seq].last
276 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<