X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Ffind_adaptor;h=2fd757aa09dfcf39e02be9037e5b9b1f372c6b6b;hb=5de6112b70b59420b245ce636a8b2e3c90acbe00;hp=aa1a0146ef7c1a16fff614810b6b474defda2be6;hpb=494dc53ebd515b1e3e9b91bbebf43059899ca4ce;p=biopieces.git diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor index aa1a014..2fd757a 100755 --- a/bp_bin/find_adaptor +++ b/bp_bin/find_adaptor @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -# Copyright (C) 2007-2011 Martin A. Hansen. +# Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -28,143 +28,130 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - +require 'pp' require 'maasha/biopieces' require 'maasha/seq' +require 'maasha/seq/backtrack' -def disambiguate(adaptor) - adaptor_disamb = adaptor.dup - adaptor_disamb.gsub!('U', 'T') - adaptor_disamb.gsub!('R', '[AG]') - adaptor_disamb.gsub!('Y', '[CT]') - adaptor_disamb.gsub!('S', '[GC]') - adaptor_disamb.gsub!('W', '[AT]') - adaptor_disamb.gsub!('M', '[AC]') - adaptor_disamb.gsub!('K', '[GT]') - adaptor_disamb.gsub!('V', '[ACG]') - adaptor_disamb.gsub!('H', '[ACT]') - adaptor_disamb.gsub!('D', '[AGT]') - adaptor_disamb.gsub!('B', '[CGT]') - adaptor_disamb.gsub!('N', '.') - adaptor_disamb +def percent2real(length, percent) + (length * percent * 0.01).round end -class Seq - # Method that finds an adaptor or part thereof in the sequence of a Seq object. - # Returns a Match object if the adaptor was found otherwise nil. The ed_percent - # indicates the maximum edit distance allowed in all possible overlaps. - def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0, dist = 0) - raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent - - if pos < 0 - pos = self.length + pos # pos offset from the right end - end - - if pos < self.length - dist + 1 - if match = adaptor_find_simple(adaptor_disamb, pos) - return match - elsif match = adaptor_find_complex(adaptor, pos, ed_percent) - return match - elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent, dist) - return match - end - end - end +include BackTrack - private +casts = [] +casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +if options[:forward_rc] + options[:forward] = Seq.new(seq: options[:forward_rc], type: :dna).reverse.complement.seq +end - # Method to find an adaptor in a sequence taking into account ambiguity - # codes, but not considering mismatches, insertions, and deletions. - def adaptor_find_simple(adaptor, pos) - self.seq.upcase.match(adaptor, pos) do |m| - return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length) - end - end +if options[:reverse_rc] + options[:reverse] = Seq.new(seq: options[:reverse_rc], type: :dna).reverse.complement.seq +end - # Method to find an adaptor in a sequence taking into account ambiguity - # codes, mismatches, insertions, and deletions. - def adaptor_find_complex(adaptor, pos, ed_percent) - ed_max = (adaptor.length * ed_percent * 0.01).round +raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse] - match = self.match(adaptor, pos, ed_max) +if options[:forward] + options[:len_forward] = options[:forward].length unless options[:len_forward] - match + if options[:len_forward] > options[:forward].length + raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" end - # Method to find part of an adaptor at the right end of a sequence taking - # into account ambiguity codes, mismatches, insertions, and deletions. - def adaptor_partial_find_complex(adaptor, pos, ed_percent, dist) - if pos > self.length - adaptor.length - adaptor = adaptor[0 ... self.length - pos] - else - adaptor = adaptor[0 ... adaptor.length] - - pos = self.length - adaptor.length - end + fmis = percent2real(options[:forward].length, options[:mismatches]) + fins = percent2real(options[:forward].length, options[:insertions]) + fdel = percent2real(options[:forward].length, options[:deletions]) +end - # puts self.seq +if options[:reverse] + options[:len_reverse] = options[:reverse].length unless options[:len_reverse] - while adaptor.length > 0 and adaptor.length >= dist - # puts (" " * pos) + adaptor + if options[:len_reverse] > options[:reverse].length + raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})" + end - ed_max = (adaptor.length * ed_percent * 0.01).round + rmis = percent2real(options[:reverse].length, options[:mismatches]) + rins = percent2real(options[:reverse].length, options[:insertions]) + rdel = percent2real(options[:reverse].length, options[:deletions]) +end - if ed_max == 0 - self.seq.upcase.match(adaptor, pos) do |m| - return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length) - end - else - self.scan(adaptor, pos, ed_max).each do |match| - return match +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each do |record| + if record[:SEQ] + entry = Seq.new_bp(record) + + if options[:forward] and record[:SEQ].length >= options[:forward].length + if m = entry.patmatch(options[:forward], max_mismatches: fmis, max_insertions: fins, max_deletions: fdel) + record[:ADAPTOR_POS_LEFT] = m.pos + record[:ADAPTOR_LEN_LEFT] = m.length + record[:ADAPTOR_PAT_LEFT] = m.match + elsif options[:len_forward] < options[:forward].length + len = options[:forward].length - 1 + pat = options[:forward] + + while len >= options[:len_forward] + fmis = percent2real(len, options[:mismatches]) + fins = percent2real(len, options[:insertions]) + fdel = percent2real(len, options[:deletions]) + + pat = pat[1 ... pat.length] + + if m = entry.patmatch(pat, start: 0, stop: len, max_mismatches: fmis, max_insertions: fins, max_deletions: fdel) + record[:ADAPTOR_POS_LEFT] = m.pos + record[:ADAPTOR_LEN_LEFT] = m.length + record[:ADAPTOR_PAT_LEFT] = m.match + + break + end + + len -= 1 + end end end - adaptor = adaptor[0 ... -1] - - pos += 1 - end - end -end - -casts = [] -casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'edit_distance', :short=>'e', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'pos', :short=>'p', :type=>'int', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"} -casts << {:long=>'dist', :short=>'d', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'cache', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} + if options[:reverse] and record[:SEQ].length >= options[:reverse].length + if m = entry.patmatch(options[:reverse], max_mismatches: rmis, max_insertions: rins, max_deletions: rdel) + record[:ADAPTOR_POS_RIGHT] = m.pos + record[:ADAPTOR_LEN_RIGHT] = m.length + record[:ADAPTOR_PAT_RIGHT] = m.match + elsif options[:len_reverse] < options[:reverse].length + len = options[:reverse].length - 1 + pat = options[:reverse] -bp = Biopieces.new + while len >= options[:len_reverse] + rmis = percent2real(len, options[:mismatches]) + rins = percent2real(len, options[:insertions]) + rdel = percent2real(len, options[:deletions]) -options = bp.parse(ARGV, casts) + pat = pat[0 ... pat.length - 1] -adaptor = options[:adaptor].to_s.upcase -adaptor_disamb = disambiguate(adaptor) + if m = entry.patmatch(pat, start: entry.length - len, max_mismatches: rmis, max_insertions: rins, max_deletions: rdel) + record[:ADAPTOR_POS_RIGHT] = m.pos + record[:ADAPTOR_LEN_RIGHT] = m.length + record[:ADAPTOR_PAT_RIGHT] = m.match -pos = options[:pos] -pos -= 1 if pos > 0 # pos was 1-based + break + end -cache = {} - -bp.each_record do |record| - if record.has_key? :SEQ - entry = Seq.new(record[:SEQ_NAME], record[:SEQ], "dna", record[:SCORES]) - - if cache[entry.seq.upcase.to_sym] and options[:cache] - match = cache[entry.seq.upcase] - else - match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance], options[:dist]) - - cache[entry.seq.upcase.to_sym] = match if match and options[:cache] + len -= 1 + end + end + end end - if match - record[:ADAPTOR_POS] = match.pos - record[:ADAPTOR_LEN] = match.length - record[:ADAPTOR_MATCH] = match.match - end + output.puts record end - - bp.puts record end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<