]> git.donarmstrong.com Git - biopieces.git/blobdiff - bp_bin/find_adaptor
added order_pairs2
[biopieces.git] / bp_bin / find_adaptor
index f19648e064fba502d6cc474ead97c6222580dd91..bf4f372d12a0ad110100dd5ddc221d1bc208fb68 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-# Copyright (C) 2007-2011 Martin A. Hansen.
+# Copyright (C) 2007-2012 Martin A. Hansen.
 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 
-require 'biopieces'
-require 'seq'
-
-def disambiguate(adaptor)
-  adaptor_disamb = adaptor.dup
-  adaptor_disamb.gsub!('U', 'T')
-  adaptor_disamb.gsub!('R', '[AG]')
-  adaptor_disamb.gsub!('Y', '[CT]')
-  adaptor_disamb.gsub!('S', '[GC]')
-  adaptor_disamb.gsub!('W', '[AT]')
-  adaptor_disamb.gsub!('M', '[AC]')
-  adaptor_disamb.gsub!('K', '[GT]')
-  adaptor_disamb.gsub!('V', '[ACG]')
-  adaptor_disamb.gsub!('H', '[ACT]')
-  adaptor_disamb.gsub!('D', '[AGT]')
-  adaptor_disamb.gsub!('B', '[CGT]')
-  adaptor_disamb.gsub!('N', '.')
-  adaptor_disamb
-end
+require 'pp'
+require 'maasha/biopieces'
+require 'maasha/fasta'
+
+# Error class for PatScan errors.
+class PatScanError < StandardError; end;
 
-class Seq
-  # Method that finds an adaptor or part thereof in the sequence of a Seq object.
-  # Returns a Match object if the adaptor was found otherwise nil. The ed_percent
-  # indicates the maximum edit distance allowed in all possible overlaps.
-  def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0)
-    raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent
+class PatScan
+  def initialize(tmpdir, cpus)
+               @tmpdir       = tmpdir
+    @cpus         = cpus
+    @files_fasta  = Dir.glob(File.join(@tmpdir, "*.fna"))
+  end
 
-    if pos < 0
-      pos = self.length - pos # pos offset from the right end
+  def run(pattern)
+    file_pat = File.join(@tmpdir, "pat.txt")
+
+    File.open(file_pat, 'w') do |ios|
+      ios.puts pattern
     end
 
-    match = adaptor_find_simple(adaptor_disamb, pos)          or
-            adaptor_find_complex(adaptor, pos, ed_percent)    or
-            adaptor_partial_find_complex(adaptor, ed_percent)
+    child_count = 0
+    ch_mutex    = Mutex.new
+    threads     = []
+
+    @files_fasta.each do |file_fasta|
+      Thread.pass while child_count >= @cpus
+      ch_mutex.synchronize { child_count += 1 }
 
-    match
+      threads << Thread.new do
+        command = command_compile(file_pat, file_fasta)
+        system(command)
+        raise PatScanError, "Command failed: #{command}" unless $?.success?
+        ch_mutex.synchronize { child_count -= 1 }
+      end
+    end
+
+    threads.each { |t| t.join }
   end
 
-  private
+  def parse_results
+    files_result = Dir.glob(File.join(@tmpdir, "*.out"))
+
+    matches = {}
+
+    files_result.each do |file|
+      Fasta.open(file, 'r') do |ios|
+        ios.each do |entry|
+          if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
+            name  = $1.to_i
+            start = $2.to_i - 1
+            stop  = $3.to_i - 1
+            matches[name] = [start, stop - start + 1] unless matches[name]
+          else
+            raise "Failed to parse sequence name: #{entry.seq_name}"
+          end
+        end
+      end
+    end
+
+    matches
+  end
 
-  # Method to find an adaptor in a sequence taking into account ambiguity
-  # codes, but not considering mismatches, insertions, and deletions.
-  def adaptor_find_simple(adaptor, pos)
-    self.seq.upcase.match(adaptor, pos) do |m|
-      return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
+  def clean
+    Dir.glob(File.join(@tmpdir, "*.out")).each do |file|
+      File.unlink(file)
     end
   end
 
-  # Method to find an adaptor in a sequence taking into account ambiguity
-  # codes, mismatches, insertions, and deletions.
-  def adaptor_find_complex(adaptor, pos, ed_percent)
-    ed_max = (adaptor.length * ed_percent * 0.01).round
+  private
 
-    self.scan(adaptor, pos, ed_max).each do |match|
-      return match
-    end
+  def command_compile(file_pat, file_fasta)
+    commands = []
+    commands << "nice -n 19"
+    commands << "scan_for_matches"
+    commands << file_pat
+    commands << "< #{file_fasta}"
+    commands << "> #{file_fasta}.out"
+    command = commands.join(" ")
   end
+end
 
-  # Method to find part of an adaptor at the right end of a sequence taking
-  # into account ambiguity codes, mismatches, insertions, and deletions.
-  def adaptor_partial_find_complex(adaptor, ed_percent)
-    adaptor = adaptor[0 ... -1]
+# Error class for Pattern errors.
+class PatternError < StandardError; end;
 
-    pos = self.len - adaptor.length
+class Pattern
+  def self.create_left(adaptor, misp, insp, delp, len)
+    patterns = []
 
-    while adaptor.length > 0
-      ed_max = (adaptor.length * ed_percent * 0.01).round
+    (adaptor.length).downto(len) do |i|
+      pattern = adaptor[adaptor.length - i ... adaptor.length]
+      mis     = (pattern.length * misp * 0.01).round
+      ins     = (pattern.length * insp * 0.01).round
+      del     = (pattern.length * delp * 0.01).round
 
-      if ed_max == 0
-        self.seq.upcase.match(adaptor, pos) do |m|
-          return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
-        end
+      if i == adaptor.length
+        patterns << "#{pattern}[#{mis},#{ins},#{del}]"
       else
-        self.scan(adaptor, pos, ed_max).each do |match|
-          return match
-        end
+        patterns << "^ #{pattern}[#{mis},#{ins},#{del}]"
+      end
+    end
+
+    compile(patterns)
+  end
+
+  def self.create_right(adaptor, misp, insp, delp, len)
+    patterns = []
+
+    (adaptor.length).downto(len) do |i|
+      pattern = adaptor[0 ... i]
+      mis     = (pattern.length * misp * 0.01).round
+      ins     = (pattern.length * insp * 0.01).round
+      del     = (pattern.length * delp * 0.01).round
+
+      if i == adaptor.length
+        patterns << "#{pattern}[#{mis},#{ins},#{del}]"
+      else
+        patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
       end
+    end
+
+    compile(patterns)
+  end
+
+  private
+
+  def self.compile(patterns)
+    new_patterns = []
 
-      adaptor = adaptor[0 ... -1]
+    while patterns.size > 1
+      new_patterns = patterns[0 ... -2] 
+      new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )"
 
-      pos = self.len - adaptor.length
+      patterns = new_patterns
     end
+
+    patterns.first
   end
 end
 
 casts = []
-casts << {:long=>'adaptor',       :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'edit_distance', :short=>'e', :type=>'uint',   :mandatory=>false, :default=>20,  :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'pos',           :short=>'p', :type=>'int',    :mandatory=>false, :default=>1,   :allowed=>nil, :disallowed=>"0"}
+casts << {:long=>'forward',       :short=>'f', :type=>'string', :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward_rc',    :short=>'F', :type=>'string', :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse',       :short=>'r', :type=>'string', :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse_rc',    :short=>'R', :type=>'string', :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'len_forward',   :short=>'l', :type=>'uint',   :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'len_reverse',   :short=>'L', :type=>'uint',   :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches',    :short=>'m', :type=>'uint',   :mandatory=>false, :default=>10,         :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions',    :short=>'i', :type=>'uint',   :mandatory=>false, :default=>5,          :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions',     :short=>'d', :type=>'uint',   :mandatory=>false, :default=>5,          :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'cpus',          :short=>'c', :type=>'uint',   :mandatory=>false, :default=>1,          :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'bases_per_file',:short=>'b', :type=>'uint',   :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+if options[:forward_rc]
+  options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq
+end
 
-bp = Biopieces.new
+if options[:reverse_rc]
+  options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq
+end
 
-options = bp.parse(ARGV, casts)
+raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse]
 
-adaptor        = options[:adaptor].to_s.upcase
-adaptor_disamb = disambiguate(adaptor)
+if options[:forward]
+  options[:len_forward] = options[:forward].length unless options[:len_forward]
 
-pos  = options[:pos]
-pos -= 1 if pos > 0  # pos was 1-based
+  if options[:len_forward] > options[:forward].length
+    raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" 
+  end
+end
+
+if options[:reverse]
+  options[:len_reverse] = options[:reverse].length unless options[:len_reverse]
+
+  if options[:len_reverse] > options[:reverse].length
+    raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
+  end
+end
 
-bp.each_record do |record|
-  if record.has_key? :SEQ
-    entry = Seq.new(record[:SEQ_NAME], record[:SEQ], "dna", record[:SCORES])
+tmpdir       = Biopieces.mktmpdir
+file_records = File.join(tmpdir, "data.stream")
 
-    if match = entry.adaptor_find(adaptor, adaptor_disamb, options[:pos], options[:edit_distance])
-      record[:ADAPTOR_POS]   = match.pos
-      record[:ADAPTOR_LEN]   = match.length
-      record[:ADAPTOR_MATCH] = match.match
+count_file = 0
+count_seq  = 0
+bases      = 0
+
+Biopieces.open(options[:stream_in], file_records) do |input, output|
+  file_fasta = File.join(tmpdir, "#{count_file}.fna")
+  out_fa     = Fasta.open(file_fasta, 'w')
+
+  input.each do |record|
+    output.puts record
+
+    if record[:SEQ] and record[:SEQ].length > 0
+      record[:SEQ_NAME] = count_seq
+
+      seq = Seq.new_bp(record)
+
+      out_fa.puts seq.to_fasta
+
+      count_seq += 1;
+      bases      += record[:SEQ].length
+
+      if bases > options[:bases_per_file]
+        out_fa.close
+        bases       = 0
+        count_file += 1
+        file_fasta  = File.join(tmpdir, "#{count_file}.fna")
+        out_fa      = Fasta.open(file_fasta, 'w')
+      end
     end
   end
 
-  bp.puts record
+  out_fa.close if out_fa.respond_to? :close
+end
+
+patscan       = PatScan.new(tmpdir, options[:cpus])
+matches_left  = {}
+matches_right = {}
+
+if options[:forward]
+  pat_left = Pattern.create_left( 
+    options[:forward],
+    options[:mismatches],
+    options[:insertions],
+    options[:deletions],
+    options[:len_forward])
+
+  patscan.run(pat_left)
+  matches_left = patscan.parse_results
+end
+
+if options[:reverse]
+  patscan.clean
+  pat_right = Pattern.create_right(
+    options[:reverse],
+    options[:mismatches],
+    options[:insertions],
+    options[:deletions],
+    options[:len_reverse])
+
+  patscan.run(pat_right)
+  matches_right = patscan.parse_results
+end
+
+count_seq = 0
+
+Biopieces.open(file_records, options[:stream_out]) do |input, output|
+  input.each_record do |record|
+    if record[:SEQ]
+      if matches_left[count_seq]
+        record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first
+        record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last
+        record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]]
+      end
+
+      if matches_right[count_seq]
+        record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first
+        record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last
+        record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]]
+      end
+
+      count_seq += 1;
+    end
+
+    output.puts record
+  end
 end