From: martinahansen Date: Mon, 10 Dec 2012 18:34:35 +0000 (+0000) Subject: unrolling new find_adaptor X-Git-Url: https://git.donarmstrong.com/?p=biopieces.git;a=commitdiff_plain;h=a8af1d203619b49d9633a5dd7fe4715345251896 unrolling new find_adaptor git-svn-id: http://biopieces.googlecode.com/svn/trunk@2039 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor index bf4f372..5b2c96b 100755 --- a/bp_bin/find_adaptor +++ b/bp_bin/find_adaptor @@ -28,159 +28,27 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - require 'pp' require 'maasha/biopieces' -require 'maasha/fasta' - -# Error class for PatScan errors. -class PatScanError < StandardError; end; - -class PatScan - def initialize(tmpdir, cpus) - @tmpdir = tmpdir - @cpus = cpus - @files_fasta = Dir.glob(File.join(@tmpdir, "*.fna")) - end - - def run(pattern) - file_pat = File.join(@tmpdir, "pat.txt") - - File.open(file_pat, 'w') do |ios| - ios.puts pattern - end - - child_count = 0 - ch_mutex = Mutex.new - threads = [] - - @files_fasta.each do |file_fasta| - Thread.pass while child_count >= @cpus - ch_mutex.synchronize { child_count += 1 } - - threads << Thread.new do - command = command_compile(file_pat, file_fasta) - system(command) - raise PatScanError, "Command failed: #{command}" unless $?.success? - ch_mutex.synchronize { child_count -= 1 } - end - end - - threads.each { |t| t.join } - end - - def parse_results - files_result = Dir.glob(File.join(@tmpdir, "*.out")) - - matches = {} - - files_result.each do |file| - Fasta.open(file, 'r') do |ios| - ios.each do |entry| - if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ - name = $1.to_i - start = $2.to_i - 1 - stop = $3.to_i - 1 - matches[name] = [start, stop - start + 1] unless matches[name] - else - raise "Failed to parse sequence name: #{entry.seq_name}" - end - end - end - end +require 'maasha/seq' +require 'maasha/seq/backtrack' - matches - end - - def clean - Dir.glob(File.join(@tmpdir, "*.out")).each do |file| - File.unlink(file) - end - end - - private - - def command_compile(file_pat, file_fasta) - commands = [] - commands << "nice -n 19" - commands << "scan_for_matches" - commands << file_pat - commands << "< #{file_fasta}" - commands << "> #{file_fasta}.out" - command = commands.join(" ") - end +def percent2real(length, percent) + (length * percent * 0.01).round end -# Error class for Pattern errors. -class PatternError < StandardError; end; - -class Pattern - def self.create_left(adaptor, misp, insp, delp, len) - patterns = [] - - (adaptor.length).downto(len) do |i| - pattern = adaptor[adaptor.length - i ... adaptor.length] - mis = (pattern.length * misp * 0.01).round - ins = (pattern.length * insp * 0.01).round - del = (pattern.length * delp * 0.01).round - - if i == adaptor.length - patterns << "#{pattern}[#{mis},#{ins},#{del}]" - else - patterns << "^ #{pattern}[#{mis},#{ins},#{del}]" - end - end - - compile(patterns) - end - - def self.create_right(adaptor, misp, insp, delp, len) - patterns = [] - - (adaptor.length).downto(len) do |i| - pattern = adaptor[0 ... i] - mis = (pattern.length * misp * 0.01).round - ins = (pattern.length * insp * 0.01).round - del = (pattern.length * delp * 0.01).round - - if i == adaptor.length - patterns << "#{pattern}[#{mis},#{ins},#{del}]" - else - patterns << "#{pattern}[#{mis},#{ins},#{del}] $" - end - end - - compile(patterns) - end - - private - - def self.compile(patterns) - new_patterns = [] - - while patterns.size > 1 - new_patterns = patterns[0 ... -2] - new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )" - - patterns = new_patterns - end - - patterns.first - end -end +include BackTrack casts = [] -casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'bases_per_file',:short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) @@ -200,6 +68,10 @@ if options[:forward] if options[:len_forward] > options[:forward].length raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" end + + fmis = percent2real(options[:forward].length, options[:mismatches]) + fins = percent2real(options[:forward].length, options[:insertions]) + fdel = percent2real(options[:forward].length, options[:deletions]) end if options[:reverse] @@ -208,99 +80,81 @@ if options[:reverse] if options[:len_reverse] > options[:reverse].length raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})" end -end - -tmpdir = Biopieces.mktmpdir -file_records = File.join(tmpdir, "data.stream") - -count_file = 0 -count_seq = 0 -bases = 0 -Biopieces.open(options[:stream_in], file_records) do |input, output| - file_fasta = File.join(tmpdir, "#{count_file}.fna") - out_fa = Fasta.open(file_fasta, 'w') + rmis = percent2real(options[:reverse].length, options[:mismatches]) + rins = percent2real(options[:reverse].length, options[:insertions]) + rdel = percent2real(options[:reverse].length, options[:deletions]) +end +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each do |record| - output.puts record - if record[:SEQ] and record[:SEQ].length > 0 - record[:SEQ_NAME] = count_seq - - seq = Seq.new_bp(record) - - out_fa.puts seq.to_fasta - - count_seq += 1; - bases += record[:SEQ].length - - if bases > options[:bases_per_file] - out_fa.close - bases = 0 - count_file += 1 - file_fasta = File.join(tmpdir, "#{count_file}.fna") - out_fa = Fasta.open(file_fasta, 'w') + entry = Seq.new_bp(record) + + if options[:forward] + if m = entry.patmatch(options[:forward], 0, fmis, fins, fdel) + record[:ADAPTOR_POS_LEFT] = m.pos + record[:ADAPTOR_LEN_LEFT] = m.length + record[:ADAPTOR_PAT_LEFT] = m.match + elsif options[:len_forward] < options[:forward].length + len = options[:forward].length - 1 + pat = options[:forward] + + while len >= options[:len_forward] + fmis = percent2real(len, options[:mismatches]) + fins = percent2real(len, options[:insertions]) + fdel = percent2real(len, options[:deletions]) + + pat = pat[1 ... pat.length] +$stderr.puts pat +$stderr.puts entry.seq[0 ... len] + if m = entry.patmatch(pat, [0, len], fmis, fins, fdel) + record[:ADAPTOR_POS_LEFT] = m.pos + record[:ADAPTOR_LEN_LEFT] = m.length + record[:ADAPTOR_PAT_LEFT] = m.match + + break + end + + len -= 1 + end + end end - end - end - - out_fa.close if out_fa.respond_to? :close -end -patscan = PatScan.new(tmpdir, options[:cpus]) -matches_left = {} -matches_right = {} + if options[:reverse] + if m = entry.patmatch(options[:reverse], 0, rmis, rins, rdel) + record[:ADAPTOR_POS_RIGHT] = m.pos + record[:ADAPTOR_LEN_RIGHT] = m.length + record[:ADAPTOR_PAT_RIGHT] = m.match + elsif options[:len_reverse] < options[:reverse].length + len = options[:reverse].length - 1 + pat = options[:reverse] -if options[:forward] - pat_left = Pattern.create_left( - options[:forward], - options[:mismatches], - options[:insertions], - options[:deletions], - options[:len_forward]) - - patscan.run(pat_left) - matches_left = patscan.parse_results -end + while len >= options[:len_reverse] + rmis = percent2real(len, options[:mismatches]) + rins = percent2real(len, options[:insertions]) + rdel = percent2real(len, options[:deletions]) -if options[:reverse] - patscan.clean - pat_right = Pattern.create_right( - options[:reverse], - options[:mismatches], - options[:insertions], - options[:deletions], - options[:len_reverse]) - - patscan.run(pat_right) - matches_right = patscan.parse_results -end + pat = pat[0 ... pat.length - 1] -count_seq = 0 + if m = entry.patmatch(pat, entry.length - len, rmis, rins, rdel) + record[:ADAPTOR_POS_RIGHT] = m.pos + record[:ADAPTOR_LEN_RIGHT] = m.length + record[:ADAPTOR_PAT_RIGHT] = m.match -Biopieces.open(file_records, options[:stream_out]) do |input, output| - input.each_record do |record| - if record[:SEQ] - if matches_left[count_seq] - record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first - record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last - record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]] - end + break + end - if matches_right[count_seq] - record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first - record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last - record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]] + len -= 1 + end + end end - - count_seq += 1; end output.puts record end end - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<