]> git.donarmstrong.com Git - biopieces.git/blobdiff - bp_bin/find_adaptor
refactoring of ruby code converting sequences types to symbols
[biopieces.git] / bp_bin / find_adaptor
index e1ac8f38011dede6819f7b0e638d30e2bc63443f..f2f593d1ce90f11087b04ca9d2abc8a58a40b264 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-# Copyright (C) 2007-2011 Martin A. Hansen.
+# Copyright (C) 2007-2012 Martin A. Hansen.
 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+require 'pp'
+require 'maasha/biopieces'
+require 'maasha/seq'
+require 'maasha/seq/backtrack'
 
-require 'biopieces'
-require 'seq'
-
-def disambiguate(adaptor)
-  adaptor_disamb = adaptor.dup
-  adaptor_disamb.gsub!('U', 'T')
-  adaptor_disamb.gsub!('R', '[AG]')
-  adaptor_disamb.gsub!('Y', '[CT]')
-  adaptor_disamb.gsub!('S', '[GC]')
-  adaptor_disamb.gsub!('W', '[AT]')
-  adaptor_disamb.gsub!('M', '[AC]')
-  adaptor_disamb.gsub!('K', '[GT]')
-  adaptor_disamb.gsub!('V', '[ACG]')
-  adaptor_disamb.gsub!('H', '[ACT]')
-  adaptor_disamb.gsub!('D', '[AGT]')
-  adaptor_disamb.gsub!('B', '[CGT]')
-  adaptor_disamb.gsub!('N', '.')
-  adaptor_disamb
+def percent2real(length, percent)
+  (length * percent * 0.01).round
 end
 
-class Seq
-  # Method that finds an adaptor or part thereof in the sequence of a Seq object.
-  # Returns a Match object if the adaptor was found otherwise nil. The ed_percent
-  # indicates the maximum edit distance allowed in all possible overlaps.
-  def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0, dist = 0)
-    raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent
+include BackTrack
 
-    if pos < 0
-      pos = self.length + pos # pos offset from the right end
-    end
-
-    if pos < self.length - dist + 1
-      if match = adaptor_find_simple(adaptor_disamb, pos)
-        return match
-      elsif match = adaptor_find_complex(adaptor, pos, ed_percent)
-        return match
-      elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent, dist)
-        return match
-      end
-    end
-  end
-
-  private
+casts = []
+casts << {:long=>'forward',     :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward_rc',  :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse',     :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse_rc',  :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'len_forward', :short=>'l', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'len_reverse', :short=>'L', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches',  :short=>'m', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions',  :short=>'i', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions',   :short=>'d', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+if options[:forward_rc]
+  options[:forward] = Seq.new("test", options[:forward_rc], :dna).reverse.complement.seq
+end
 
-  # Method to find an adaptor in a sequence taking into account ambiguity
-  # codes, but not considering mismatches, insertions, and deletions.
-  def adaptor_find_simple(adaptor, pos)
-    self.seq.upcase.match(adaptor, pos) do |m|
-      return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
-    end
-  end
+if options[:reverse_rc]
+  options[:reverse] = Seq.new("test", options[:reverse_rc], :dna).reverse.complement.seq
+end
 
-  # Method to find an adaptor in a sequence taking into account ambiguity
-  # codes, mismatches, insertions, and deletions.
-  def adaptor_find_complex(adaptor, pos, ed_percent)
-    ed_max = (adaptor.length * ed_percent * 0.01).round
+raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse]
 
-    match = self.match(adaptor, pos, ed_max)
+if options[:forward]
+  options[:len_forward] = options[:forward].length unless options[:len_forward]
 
-    match
+  if options[:len_forward] > options[:forward].length
+    raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" 
   end
 
-  # Method to find part of an adaptor at the right end of a sequence taking
-  # into account ambiguity codes, mismatches, insertions, and deletions.
-  def adaptor_partial_find_complex(adaptor, pos, ed_percent, dist)
-    if pos > self.length - adaptor.length
-      adaptor = adaptor[0 ... self.length - pos]
-    else
-      adaptor = adaptor[0 ... adaptor.length]
-
-      pos = self.length - adaptor.length
-    end
+  fmis = percent2real(options[:forward].length, options[:mismatches])
+  fins = percent2real(options[:forward].length, options[:insertions])
+  fdel = percent2real(options[:forward].length, options[:deletions])
+end
 
-    # puts self.seq
+if options[:reverse]
+  options[:len_reverse] = options[:reverse].length unless options[:len_reverse]
 
-    while adaptor.length > 0 and adaptor.length >= dist
-      # puts (" " * pos) + adaptor
+  if options[:len_reverse] > options[:reverse].length
+    raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
+  end
 
-      ed_max = (adaptor.length * ed_percent * 0.01).round
+  rmis = percent2real(options[:reverse].length, options[:mismatches])
+  rins = percent2real(options[:reverse].length, options[:insertions])
+  rdel = percent2real(options[:reverse].length, options[:deletions])
+end
 
-      if ed_max == 0
-        self.seq.upcase.match(adaptor, pos) do |m|
-          return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
-        end
-      else
-        self.scan(adaptor, pos, ed_max).each do |match|
-          return match
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+  input.each do |record|
+    if record[:SEQ]
+      entry = Seq.new_bp(record)
+
+      if options[:forward] and record[:SEQ].length >= options[:forward].length
+        if m = entry.patmatch(options[:forward], 0, fmis, fins, fdel)
+          record[:ADAPTOR_POS_LEFT] = m.pos
+          record[:ADAPTOR_LEN_LEFT] = m.length
+          record[:ADAPTOR_PAT_LEFT] = m.match
+        elsif options[:len_forward] < options[:forward].length
+          len = options[:forward].length - 1
+          pat = options[:forward]
+
+          while len >= options[:len_forward]
+            fmis = percent2real(len, options[:mismatches])
+            fins = percent2real(len, options[:insertions])
+            fdel = percent2real(len, options[:deletions])
+
+            pat = pat[1 ... pat.length]
+
+            if m = entry.patmatch(pat, [0, len], fmis, fins, fdel)
+              record[:ADAPTOR_POS_LEFT] = m.pos
+              record[:ADAPTOR_LEN_LEFT] = m.length
+              record[:ADAPTOR_PAT_LEFT] = m.match
+
+              break
+            end
+
+            len -= 1
+          end
         end
       end
 
-      adaptor = adaptor[0 ... -1]
-
-      pos += 1
-    end
-  end
-end
-
-casts = []
-casts << {:long=>'adaptor',       :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'edit_distance', :short=>'e', :type=>'uint',   :mandatory=>false, :default=>20,  :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'pos',           :short=>'p', :type=>'int',    :mandatory=>false, :default=>1,   :allowed=>nil, :disallowed=>"0"}
-casts << {:long=>'dist',          :short=>'d', :type=>'uint',   :mandatory=>false, :default=>0,   :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'cache',         :short=>'c', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+      if options[:reverse] and record[:SEQ].length >= options[:reverse].length
+        if m = entry.patmatch(options[:reverse], 0, rmis, rins, rdel)
+          record[:ADAPTOR_POS_RIGHT] = m.pos
+          record[:ADAPTOR_LEN_RIGHT] = m.length
+          record[:ADAPTOR_PAT_RIGHT] = m.match
+        elsif options[:len_reverse] < options[:reverse].length
+          len = options[:reverse].length - 1
+          pat = options[:reverse]
 
-bp = Biopieces.new
+          while len >= options[:len_reverse]
+            rmis = percent2real(len, options[:mismatches])
+            rins = percent2real(len, options[:insertions])
+            rdel = percent2real(len, options[:deletions])
 
-options = bp.parse(ARGV, casts)
+            pat = pat[0 ... pat.length - 1]
 
-adaptor        = options[:adaptor].to_s.upcase
-adaptor_disamb = disambiguate(adaptor)
+            if m = entry.patmatch(pat, entry.length - len, rmis, rins, rdel)
+              record[:ADAPTOR_POS_RIGHT] = m.pos
+              record[:ADAPTOR_LEN_RIGHT] = m.length
+              record[:ADAPTOR_PAT_RIGHT] = m.match
 
-pos  = options[:pos]
-pos -= 1 if pos > 0  # pos was 1-based
+              break
+            end
 
-cache = {}
-
-bp.each_record do |record|
-  if record.has_key? :SEQ
-    entry = Seq.new(record[:SEQ_NAME], record[:SEQ], "dna", record[:SCORES])
-
-    if cache[entry.seq.upcase.to_sym] and options[:cache]
-      match = cache[entry.seq.upcase]
-    else
-      match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance], options[:dist])
-
-      cache[entry.seq.upcase.to_sym] = match if match and options[:cache]
+            len -= 1
+          end
+        end
+      end
     end
 
-    if match
-      record[:ADAPTOR_POS]   = match.pos
-      record[:ADAPTOR_LEN]   = match.length
-      record[:ADAPTOR_MATCH] = match.match
-    end
+    output.puts record
   end
-
-  bp.puts record
 end
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<