X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Ffind_adaptor;h=f2f593d1ce90f11087b04ca9d2abc8a58a40b264;hb=2f0fd91b461033529a4a72e161bd133252a22eb6;hp=78e633a34ca0365694022bdf310ed1ef354871f3;hpb=cf1ef27d93920bab503036f21bce0853ac40310a;p=biopieces.git diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor index 78e633a..f2f593d 100755 --- a/bp_bin/find_adaptor +++ b/bp_bin/find_adaptor @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -# Copyright (C) 2007-2011 Martin A. Hansen. +# Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -28,251 +28,132 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - require 'pp' require 'maasha/biopieces' -require 'maasha/fasta' - -# Error class for PatScan errors. -class PatScanError < StandardError; end; - -class PatScan - def initialize(options, tmpdir, file_pattern, file_patscan, cpus) - @options = options - @file_pattern = file_pattern - @file_patscan = file_patscan - @cpus = cpus - @files_fasta = Dir.glob(File.join(tmpdir, "*.fna")) - - pat = Pattern.new(@options) - pat.write(@file_pattern) - end - -# def run -# child_count = 0 -# -# @files_fasta.each do |file| -# if fork -# Process.wait if ( child_count += 1 ) >= @cpus -# else -# command = command_compile(file) -# system(command) -# raise PatScanError, "Command failed: #{command}" unless $?.success? -# exit -# end -# end -# end - - def run - child_count = 0 - - @files_fasta.each do |file| - Thread.pass while child_count >= @cpus - child_count += 1 - - Thread.new do - command = command_compile(file) - system(command) - raise PatScanError, "Command failed: #{command}" unless $?.success? - child_count -= 1 - end - end - end - -# def run -# child_count = 0 -# ch_mutex = Mutex.new -# threads = [] -# -# @files_fasta.each do |file| -# Thread.pass while child_count >= @cpus -# ch_mutex.synchronize { child_count += 1 } -# -# threads << Thread.new do -# command = command_compile(file) -# system(command) -# raise PatScanError, "Command failed: #{command}" unless $?.success? -# ch_mutex.synchronize { child_count -= 1 } -# end -# end -# -# threads.each { |t| t.join } -# end - - def command_compile(file) - commands = [] - commands << "nice -n 19" - commands << "scan_for_matches" - commands << @file_pattern - commands << "< #{file}" - commands << "> #{file}.out" - command = commands.join(" ") - end +require 'maasha/seq' +require 'maasha/seq/backtrack' - def parse_results - matches = {} - - Fasta.open(@file_patscan, mode='r') do |ios| - ios.each do |entry| - if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ - name = $1.to_i - start = $2.to_i - 1 - stop = $3.to_i - 1 - matches[name] = [start, stop - start + 1] unless matches.has_key? name - else - raise "Failed to parse sequence name: #{entry.seq_name}" - end - end - end - - matches - end +def percent2real(length, percent) + (length * percent * 0.01).round end -# Error class for Pattern errors. -class PatternError < StandardError; end; - -class Pattern - def initialize(options) - @options = options - @patterns = [] - @patterns << pattern_internal - @patterns += patterns_end if @options[:partial] - end +include BackTrack - def to_i - new_patterns = [] - - while @patterns.size > 1 - new_patterns = @patterns[0 ... -2] - new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )" +casts = [] +casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} - @patterns = new_patterns - end +options = Biopieces.options_parse(ARGV, casts) - @patterns.first - end +if options[:forward_rc] + options[:forward] = Seq.new("test", options[:forward_rc], :dna).reverse.complement.seq +end - def write(file) - File.open(file, mode='w') do |ios| - ios.puts self.to_i - end - end +if options[:reverse_rc] + options[:reverse] = Seq.new("test", options[:reverse_rc], :dna).reverse.complement.seq +end - private +raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse] - def pattern_internal - pattern = @options[:adaptor] - mis = mis_count(pattern) - ins = ins_count(pattern) - del = del_count(pattern) +if options[:forward] + options[:len_forward] = options[:forward].length unless options[:len_forward] - "#{pattern}[#{mis},#{ins},#{del}]" + if options[:len_forward] > options[:forward].length + raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" end - def patterns_end - patterns = [] - adaptor = @options[:adaptor] - - raise PatternError, "len > adaptor length: #{@options[:len]} > #{adaptor.length - 1}" if @options[:len] > adaptor.length - 1 - - (adaptor.length - 1).downto(@options[:len]) do |i| - pattern = adaptor[0 ... i] - mis = mis_count(pattern) - ins = ins_count(pattern) - del = del_count(pattern) - patterns << "#{pattern}[#{mis},#{ins},#{del}] $" - end - - patterns - end + fmis = percent2real(options[:forward].length, options[:mismatches]) + fins = percent2real(options[:forward].length, options[:insertions]) + fdel = percent2real(options[:forward].length, options[:deletions]) +end - def mis_count(pattern) - (pattern.length * @options[:mismatches] * 0.01).round - end +if options[:reverse] + options[:len_reverse] = options[:reverse].length unless options[:len_reverse] - def ins_count(pattern) - (pattern.length * @options[:insertions] * 0.01).round + if options[:len_reverse] > options[:reverse].length + raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})" end - def del_count(pattern) - (pattern.length * @options[:deletions] * 0.01).round - end + rmis = percent2real(options[:reverse].length, options[:mismatches]) + rins = percent2real(options[:reverse].length, options[:insertions]) + rdel = percent2real(options[:reverse].length, options[:deletions]) end -casts = [] -casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'partial', :short=>'p', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'len', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'} - -BASE_PER_FILE = 10_000_000 - -options = Biopieces.options_parse(ARGV, casts) - -#tmpdir = Biopieces.mktmpdir -tmpdir = "Tyt" # DEBUG TODO -file_records = File.join(tmpdir, "data.stream") -file_pattern = File.join(tmpdir, "pattern.txt") -file_patscan = File.join(tmpdir, "patscan.out") - -number_file = 0 -number_seq = 0 -bases = 0 - -Biopieces.open(options[:stream_in], file_records) do |input, output| - file_fasta = File.join(tmpdir, "#{number_file}.fna") - out_fa = Fasta.open(file_fasta, mode='w') - +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each do |record| - output.puts record + if record[:SEQ] + entry = Seq.new_bp(record) + + if options[:forward] and record[:SEQ].length >= options[:forward].length + if m = entry.patmatch(options[:forward], 0, fmis, fins, fdel) + record[:ADAPTOR_POS_LEFT] = m.pos + record[:ADAPTOR_LEN_LEFT] = m.length + record[:ADAPTOR_PAT_LEFT] = m.match + elsif options[:len_forward] < options[:forward].length + len = options[:forward].length - 1 + pat = options[:forward] + + while len >= options[:len_forward] + fmis = percent2real(len, options[:mismatches]) + fins = percent2real(len, options[:insertions]) + fdel = percent2real(len, options[:deletions]) + + pat = pat[1 ... pat.length] + + if m = entry.patmatch(pat, [0, len], fmis, fins, fdel) + record[:ADAPTOR_POS_LEFT] = m.pos + record[:ADAPTOR_LEN_LEFT] = m.length + record[:ADAPTOR_PAT_LEFT] = m.match + + break + end + + len -= 1 + end + end + end - if record.has_key? :SEQ - record[:SEQ_NAME] = number_seq - out_fa.puts record + if options[:reverse] and record[:SEQ].length >= options[:reverse].length + if m = entry.patmatch(options[:reverse], 0, rmis, rins, rdel) + record[:ADAPTOR_POS_RIGHT] = m.pos + record[:ADAPTOR_LEN_RIGHT] = m.length + record[:ADAPTOR_PAT_RIGHT] = m.match + elsif options[:len_reverse] < options[:reverse].length + len = options[:reverse].length - 1 + pat = options[:reverse] - number_seq += 1; - bases += record[:SEQ].length + while len >= options[:len_reverse] + rmis = percent2real(len, options[:mismatches]) + rins = percent2real(len, options[:insertions]) + rdel = percent2real(len, options[:deletions]) - if bases > BASE_PER_FILE - out_fa.close - bases = 0 - number_file += 1 - file_fasta = File.join(tmpdir, "#{number_file}.fna") - out_fa = Fasta.open(file_fasta, mode='w') - end - end - end - - out_fa.close if out_fa.respond_to? :close -end + pat = pat[0 ... pat.length - 1] -patscan = PatScan.new(options, tmpdir, file_pattern, file_patscan, options[:cpus]) -patscan.run -exit -matches = patscan.parse_results + if m = entry.patmatch(pat, entry.length - len, rmis, rins, rdel) + record[:ADAPTOR_POS_RIGHT] = m.pos + record[:ADAPTOR_LEN_RIGHT] = m.length + record[:ADAPTOR_PAT_RIGHT] = m.match -number_seq = 0 + break + end -Biopieces.open(file_records, options[:stream_out]) do |input, output| - input.each_record do |record| - if record.has_key? :SEQ - if matches.has_key? number_seq - record[:ADAPTOR_POS] = matches[number_seq].first - record[:ADAPTOR_LEN] = matches[number_seq].last + len -= 1 + end + end end - - number_seq += 1; end output.puts record end end - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<