#!/usr/bin/env ruby
-# Copyright (C) 2007-2011 Martin A. Hansen.
+# Copyright (C) 2007-2012 Martin A. Hansen.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+require 'pp'
+require 'maasha/biopieces'
+require 'maasha/seq'
+require 'maasha/seq/backtrack'
-require 'biopieces'
-require 'seq'
-
-def disambiguate(adaptor)
- adaptor_disamb = adaptor.dup
- adaptor_disamb.gsub!('U', 'T')
- adaptor_disamb.gsub!('R', '[AG]')
- adaptor_disamb.gsub!('Y', '[CT]')
- adaptor_disamb.gsub!('S', '[GC]')
- adaptor_disamb.gsub!('W', '[AT]')
- adaptor_disamb.gsub!('M', '[AC]')
- adaptor_disamb.gsub!('K', '[GT]')
- adaptor_disamb.gsub!('V', '[ACG]')
- adaptor_disamb.gsub!('H', '[ACT]')
- adaptor_disamb.gsub!('D', '[AGT]')
- adaptor_disamb.gsub!('B', '[CGT]')
- adaptor_disamb.gsub!('N', '.')
- adaptor_disamb
+def percent2real(length, percent)
+ (length * percent * 0.01).round
end
-class Seq
- # Method that finds an adaptor or part thereof in the sequence of a Seq object.
- # Returns a Match object if the adaptor was found otherwise nil. The ed_percent
- # indicates the maximum edit distance allowed in all possible overlaps.
- def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0)
- raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent
+include BackTrack
- if pos < 0
- pos = self.length - pos # pos offset from the right end
- end
+casts = []
+casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+if options[:forward_rc]
+ options[:forward] = Seq.new("test", options[:forward_rc], :dna).reverse.complement.seq
+end
- if match = adaptor_find_simple(adaptor_disamb, pos)
- return match
- elsif match = adaptor_find_complex(adaptor, pos, ed_percent)
- return match
- elsif match = adaptor_partial_find_complex(adaptor, ed_percent)
- return match
- end
- end
+if options[:reverse_rc]
+ options[:reverse] = Seq.new("test", options[:reverse_rc], :dna).reverse.complement.seq
+end
- private
+raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse]
- # Method to find an adaptor in a sequence taking into account ambiguity
- # codes, but not considering mismatches, insertions, and deletions.
- def adaptor_find_simple(adaptor, pos)
- self.seq.upcase.match(adaptor, pos) do |m|
- return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
- end
+if options[:forward]
+ options[:len_forward] = options[:forward].length unless options[:len_forward]
+
+ if options[:len_forward] > options[:forward].length
+ raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})"
end
- # Method to find an adaptor in a sequence taking into account ambiguity
- # codes, mismatches, insertions, and deletions.
- def adaptor_find_complex(adaptor, pos, ed_percent)
- ed_max = (adaptor.length * ed_percent * 0.01).round
+ fmis = percent2real(options[:forward].length, options[:mismatches])
+ fins = percent2real(options[:forward].length, options[:insertions])
+ fdel = percent2real(options[:forward].length, options[:deletions])
+end
- match = self.match(adaptor, pos, ed_max)
+if options[:reverse]
+ options[:len_reverse] = options[:reverse].length unless options[:len_reverse]
- match
+ if options[:len_reverse] > options[:reverse].length
+ raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
end
- # Method to find part of an adaptor at the right end of a sequence taking
- # into account ambiguity codes, mismatches, insertions, and deletions.
- def adaptor_partial_find_complex(adaptor, ed_percent)
- adaptor = adaptor[0 ... -1]
-
- pos = self.len - adaptor.length
-
- while adaptor.length > 0
- ed_max = (adaptor.length * ed_percent * 0.01).round
+ rmis = percent2real(options[:reverse].length, options[:mismatches])
+ rins = percent2real(options[:reverse].length, options[:insertions])
+ rdel = percent2real(options[:reverse].length, options[:deletions])
+end
- if ed_max == 0
- self.seq.upcase.match(adaptor, pos) do |m|
- return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
- end
- else
- self.scan(adaptor, pos, ed_max).each do |match|
- return match
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+ input.each do |record|
+ if record[:SEQ]
+ entry = Seq.new_bp(record)
+
+ if options[:forward] and record[:SEQ].length >= options[:forward].length
+ if m = entry.patmatch(options[:forward], 0, fmis, fins, fdel)
+ record[:ADAPTOR_POS_LEFT] = m.pos
+ record[:ADAPTOR_LEN_LEFT] = m.length
+ record[:ADAPTOR_PAT_LEFT] = m.match
+ elsif options[:len_forward] < options[:forward].length
+ len = options[:forward].length - 1
+ pat = options[:forward]
+
+ while len >= options[:len_forward]
+ fmis = percent2real(len, options[:mismatches])
+ fins = percent2real(len, options[:insertions])
+ fdel = percent2real(len, options[:deletions])
+
+ pat = pat[1 ... pat.length]
+
+ if m = entry.patmatch(pat, [0, len], fmis, fins, fdel)
+ record[:ADAPTOR_POS_LEFT] = m.pos
+ record[:ADAPTOR_LEN_LEFT] = m.length
+ record[:ADAPTOR_PAT_LEFT] = m.match
+
+ break
+ end
+
+ len -= 1
+ end
end
end
- adaptor = adaptor[0 ... -1]
+ if options[:reverse] and record[:SEQ].length >= options[:reverse].length
+ if m = entry.patmatch(options[:reverse], 0, rmis, rins, rdel)
+ record[:ADAPTOR_POS_RIGHT] = m.pos
+ record[:ADAPTOR_LEN_RIGHT] = m.length
+ record[:ADAPTOR_PAT_RIGHT] = m.match
+ elsif options[:len_reverse] < options[:reverse].length
+ len = options[:reverse].length - 1
+ pat = options[:reverse]
- pos = self.len - adaptor.length
- end
- end
-end
-
-casts = []
-casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'edit_distance', :short=>'e', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'pos', :short=>'p', :type=>'int', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"}
-
-bp = Biopieces.new
+ while len >= options[:len_reverse]
+ rmis = percent2real(len, options[:mismatches])
+ rins = percent2real(len, options[:insertions])
+ rdel = percent2real(len, options[:deletions])
-options = bp.parse(ARGV, casts)
+ pat = pat[0 ... pat.length - 1]
-adaptor = options[:adaptor].to_s.upcase
-adaptor_disamb = disambiguate(adaptor)
+ if m = entry.patmatch(pat, entry.length - len, rmis, rins, rdel)
+ record[:ADAPTOR_POS_RIGHT] = m.pos
+ record[:ADAPTOR_LEN_RIGHT] = m.length
+ record[:ADAPTOR_PAT_RIGHT] = m.match
-pos = options[:pos]
-pos -= 1 if pos > 0 # pos was 1-based
+ break
+ end
-bp.each_record do |record|
- if record.has_key? :SEQ
- entry = Seq.new(record[:SEQ_NAME], record[:SEQ], "dna", record[:SCORES])
-
- if match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance])
- record[:ADAPTOR_POS] = match.pos
- record[:ADAPTOR_LEN] = match.length
- record[:ADAPTOR_MATCH] = match.match
+ len -= 1
+ end
+ end
+ end
end
- end
- bp.puts record
+ output.puts record
+ end
end
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<