X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Ffind_adaptor;h=0534572bd4ff08669f934a57d1d1bfc9a64d9486;hb=449502a619149d8b881cd0d1e7c2ba01b2ff88aa;hp=15371bdd03045ffdbd64159264c24a6d0eebc2aa;hpb=dc75d5a7960d8b3d4b315a19b5d187fd56e1eb75;p=biopieces.git diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor index 15371bd..0534572 100755 --- a/bp_bin/find_adaptor +++ b/bp_bin/find_adaptor @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -# Copyright (C) 2007-2011 Martin A. Hansen. +# Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -28,183 +28,132 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - require 'pp' require 'maasha/biopieces' -require 'maasha/fasta' - -# Error class for PatScan errors. -class PatScanError < StandardError; end; - -class PatScan - def initialize(options, file_fasta, file_pattern, file_patscan) - @options = options - @file_fasta = file_fasta - @file_pattern = file_pattern - @file_patscan = file_patscan - - pat = Pattern.new(@options) - pat.write(@file_pattern) - end - - def run - commands = [] - commands << "nice -n 19" - commands << "scan_for_matches" - commands << @file_pattern - commands << "< #{@file_fasta}" - commands << "> #{@file_patscan}" - command = commands.join(" ") - system(command) - raise PatScanError, "Command failed: #{command}" unless $?.success? - end - - def parse_results - matches = {} - - Fasta.open(@file_patscan, mode='r') do |ios| - ios.each_entry do |entry| - if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ - name = $1.to_i - start = $2.to_i - 1 - stop = $3.to_i - 1 - matches[name] = [start, stop - start + 1] unless matches.has_key? name - else - raise "Failed to parse sequence name: #{entry.seq_name}" - end - end - end +require 'maasha/seq' +require 'maasha/seq/backtrack' - matches - end +def percent2real(length, percent) + (length * percent * 0.01).round end -# Error class for Pattern errors. -class PatternError < StandardError; end; +include BackTrack -class Pattern - def initialize(options) - @options = options - @patterns = [] - @patterns << pattern_internal - @patterns += patterns_end if @options[:trim_end] - end - - def to_i - new_patterns = [] - - while @patterns.size > 1 - new_patterns = @patterns[0 ... -2] - new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )" +casts = [] +casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} - @patterns = new_patterns - end +options = Biopieces.options_parse(ARGV, casts) - @patterns.first - end +if options[:forward_rc] + options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq +end - def write(file) - File.open(file, mode='w') do |ios| - ios.puts self.to_i - end - end +if options[:reverse_rc] + options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq +end - private +raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse] - def pattern_internal - pattern = @options[:adaptor] - mis = mis_count(pattern) - ins = ins_count(pattern) - del = del_count(pattern) +if options[:forward] + options[:len_forward] = options[:forward].length unless options[:len_forward] - "#{pattern}[#{mis},#{ins},#{del}]" + if options[:len_forward] > options[:forward].length + raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" end - def patterns_end - patterns = [] - adaptor = @options[:adaptor] - - raise PatternError, "trim_end_min > adaptor length: #{@options[:trim_end_min]} > #{adaptor.length - 1}" if @options[:trim_end_min] > adaptor.length - 1 - - (adaptor.length - 1).downto(@options[:trim_end_min]) do |i| - pattern = adaptor[0 ... i] - mis = mis_count(pattern) - ins = ins_count(pattern) - del = del_count(pattern) - patterns << "#{pattern}[#{mis},#{ins},#{del}] $" - end + fmis = percent2real(options[:forward].length, options[:mismatches]) + fins = percent2real(options[:forward].length, options[:insertions]) + fdel = percent2real(options[:forward].length, options[:deletions]) +end - patterns - end +if options[:reverse] + options[:len_reverse] = options[:reverse].length unless options[:len_reverse] - def mis_count(pattern) - (pattern.length * @options[:mismatches] * 0.01).round + if options[:len_reverse] > options[:reverse].length + raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})" end - def ins_count(pattern) - (pattern.length * @options[:insertions] * 0.01).round - end - - def del_count(pattern) - (pattern.length * @options[:deletions] * 0.01).round - end + rmis = percent2real(options[:reverse].length, options[:mismatches]) + rins = percent2real(options[:reverse].length, options[:insertions]) + rdel = percent2real(options[:reverse].length, options[:deletions]) end -casts = [] -casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'trim_end', :short=>'t', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'trim_end_min', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} - -options = Biopieces.options_parse(ARGV, casts) - -tmpdir = Biopieces.mktmpdir -file_fasta = File.join(tmpdir, "data.fna") -file_records = File.join(tmpdir, "data.stream") -file_pattern = File.join(tmpdir, "pattern.txt") -file_patscan = File.join(tmpdir, "patscan.fna") +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each do |record| + if record[:SEQ] and record[:SEQ].length > 0 + entry = Seq.new_bp(record) + + if options[:forward] + if m = entry.patmatch(options[:forward], 0, fmis, fins, fdel) + record[:ADAPTOR_POS_LEFT] = m.pos + record[:ADAPTOR_LEN_LEFT] = m.length + record[:ADAPTOR_PAT_LEFT] = m.match + elsif options[:len_forward] < options[:forward].length + len = options[:forward].length - 1 + pat = options[:forward] + + while len >= options[:len_forward] + fmis = percent2real(len, options[:mismatches]) + fins = percent2real(len, options[:insertions]) + fdel = percent2real(len, options[:deletions]) + + pat = pat[1 ... pat.length] + + if m = entry.patmatch(pat, [0, len], fmis, fins, fdel) + record[:ADAPTOR_POS_LEFT] = m.pos + record[:ADAPTOR_LEN_LEFT] = m.length + record[:ADAPTOR_PAT_LEFT] = m.match + + break + end + + len -= 1 + end + end + end -count = 0 + if options[:reverse] + if m = entry.patmatch(options[:reverse], 0, rmis, rins, rdel) + record[:ADAPTOR_POS_RIGHT] = m.pos + record[:ADAPTOR_LEN_RIGHT] = m.length + record[:ADAPTOR_PAT_RIGHT] = m.match + elsif options[:len_reverse] < options[:reverse].length + len = options[:reverse].length - 1 + pat = options[:reverse] -Biopieces.open(options[:stream_in], file_records) do |input, output| - Fasta.open(file_fasta, mode='w') do |out_fa| - input.each do |record| - output.puts record + while len >= options[:len_reverse] + rmis = percent2real(len, options[:mismatches]) + rins = percent2real(len, options[:insertions]) + rdel = percent2real(len, options[:deletions]) - if record.has_key? :SEQ - record[:SEQ_NAME] = count - out_fa.puts record + pat = pat[0 ... pat.length - 1] - count += 1; - end - end - end -end - -patscan = PatScan.new(options, file_fasta, file_pattern, file_patscan) -patscan.run -matches = patscan.parse_results + if m = entry.patmatch(pat, entry.length - len, rmis, rins, rdel) + record[:ADAPTOR_POS_RIGHT] = m.pos + record[:ADAPTOR_LEN_RIGHT] = m.length + record[:ADAPTOR_PAT_RIGHT] = m.match -count = 0 + break + end -Biopieces.open(file_records, options[:stream_out]) do |input, output| - input.each_record do |record| - if record.has_key? :SEQ - if matches.has_key? count - record[:ADAPTOR_POS] = matches[count].first - record[:ADAPTOR_LEN] = matches[count].last + len -= 1 + end + end end - - count += 1; end output.puts record end end - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<