X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Ffind_adaptor;h=290af5a19dfa5da0614f0ea911ec1c6c636bf63c;hb=5d888119b81849888096e08a4812d1477ff67d4e;hp=53e453a147fd62414bfa1e4a9af96e8f27bc2326;hpb=4b299e55753c044bb6f80094a1748bf94416d749;p=biopieces.git diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor index 53e453a..290af5a 100755 --- a/bp_bin/find_adaptor +++ b/bp_bin/find_adaptor @@ -37,40 +37,63 @@ require 'maasha/fasta' class PatScanError < StandardError; end; class PatScan - def initialize(options, file_fasta, file_pattern, file_patscan) + def initialize(options, tmpdir, file_pattern, cpus) @options = options - @file_fasta = file_fasta + @tmpdir = tmpdir @file_pattern = file_pattern - @file_patscan = file_patscan + @cpus = cpus + @files_fasta = Dir.glob(File.join(@tmpdir, "*.fna")) pat = Pattern.new(@options) pat.write(@file_pattern) end def run + child_count = 0 + ch_mutex = Mutex.new + threads = [] + + @files_fasta.each do |file| + Thread.pass while child_count >= @cpus + ch_mutex.synchronize { child_count += 1 } + + threads << Thread.new do + command = command_compile(file) + system(command) + raise PatScanError, "Command failed: #{command}" unless $?.success? + ch_mutex.synchronize { child_count -= 1 } + end + end + + threads.each { |t| t.join } + end + + def command_compile(file) commands = [] commands << "nice -n 19" commands << "scan_for_matches" commands << @file_pattern - commands << "< #{@file_fasta}" - commands << "> #{@file_patscan}" + commands << "< #{file}" + commands << "> #{file}.out" command = commands.join(" ") - system(command) - raise PatScanError, "Command failed: #{command}" unless $?.success? end def parse_results + files_result = Dir.glob(File.join(@tmpdir, "*.out")) + matches = {} - Fasta.open(@file_patscan, mode='r') do |ios| - ios.each_entry do |entry| - if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ - name = $1.to_i - start = $2.to_i - 1 - stop = $3.to_i - 1 - matches[name] = [start, stop - start + 1] - else - raise "Failed to parse sequence name: #{entry.seq_name}" + files_result.each do |file| + Fasta.open(file, 'r') do |ios| + ios.each do |entry| + if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ + name = $1.to_i + start = $2.to_i - 1 + stop = $3.to_i - 1 + matches[name] = [start, stop - start + 1] unless matches.has_key? name + else + raise "Failed to parse sequence name: #{entry.seq_name}" + end end end end @@ -87,7 +110,7 @@ class Pattern @options = options @patterns = [] @patterns << pattern_internal - @patterns += patterns_end if @options[:trim_end] + @patterns += patterns_end if @options[:partial] end def to_i @@ -104,7 +127,7 @@ class Pattern end def write(file) - File.open(file, mode='w') do |ios| + File.open(file, 'w') do |ios| ios.puts self.to_i end end @@ -124,9 +147,9 @@ class Pattern patterns = [] adaptor = @options[:adaptor] - raise PatternError, "trim_end_min > adaptor length: #{@options[:trim_end_min]} > #{adaptor.length - 1}" if @options[:trim_end_min] > adaptor.length - 1 + raise PatternError, "len > adaptor length: #{@options[:len]} > #{adaptor.length - 1}" if @options[:len] > adaptor.length - 1 - (adaptor.length - 1).downto(@options[:trim_end_min]) do |i| + (adaptor.length - 1).downto(@options[:len]) do |i| pattern = adaptor[0 ... i] mis = mis_count(pattern) ins = ins_count(pattern) @@ -151,53 +174,71 @@ class Pattern end casts = [] -casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'trim_end', :short=>'t', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'trim_end_min', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'partial', :short=>'p', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'len', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'} + +BASE_PER_FILE = 10_000_000 options = Biopieces.options_parse(ARGV, casts) tmpdir = Biopieces.mktmpdir -file_fasta = File.join(tmpdir, "data.fna") file_records = File.join(tmpdir, "data.stream") file_pattern = File.join(tmpdir, "pattern.txt") -file_patscan = File.join(tmpdir, "patscan.fna") -count = 0 +number_file = 0 +number_seq = 0 +bases = 0 Biopieces.open(options[:stream_in], file_records) do |input, output| - Fasta.open(file_fasta, mode='w') do |out_fa| - input.each do |record| - output.puts record + file_fasta = File.join(tmpdir, "#{number_file}.fna") + out_fa = Fasta.open(file_fasta, 'w') - if record.has_key? :SEQ - record[:SEQ_NAME] = count - out_fa.puts record + input.each do |record| + output.puts record + + if record.has_key? :SEQ + record[:SEQ_NAME] = number_seq + + seq = Seq.new_bp(record) - count += 1; + out_fa.puts seq.to_fasta + + number_seq += 1; + bases += record[:SEQ].length + + if bases > BASE_PER_FILE + out_fa.close + bases = 0 + number_file += 1 + file_fasta = File.join(tmpdir, "#{number_file}.fna") + out_fa = Fasta.open(file_fasta, 'w') end end end + + out_fa.close if out_fa.respond_to? :close end -patscan = PatScan.new(options, file_fasta, file_pattern, file_patscan) +patscan = PatScan.new(options, tmpdir, file_pattern, options[:cpus]) patscan.run matches = patscan.parse_results -count = 0 +number_seq = 0 Biopieces.open(file_records, options[:stream_out]) do |input, output| input.each_record do |record| if record.has_key? :SEQ - if matches.has_key? count - record[:ADAPTOR_POS] = matches[count].first - record[:ADAPTOR_LEN] = matches[count].last + if matches.has_key? number_seq + record[:ADAPTOR_POS] = matches[number_seq].first + record[:ADAPTOR_LEN] = matches[number_seq].last end - count += 1; + number_seq += 1; end output.puts record