]> git.donarmstrong.com Git - biopieces.git/commitdiff
unrolling new find_adaptor
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 10 Dec 2012 18:34:35 +0000 (18:34 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 10 Dec 2012 18:34:35 +0000 (18:34 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@2039 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/find_adaptor

index bf4f372d12a0ad110100dd5ddc221d1bc208fb68..5b2c96b76403428a3ae776d81cbbdda87587e046 100755 (executable)
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-
 require 'pp'
 require 'maasha/biopieces'
-require 'maasha/fasta'
-
-# Error class for PatScan errors.
-class PatScanError < StandardError; end;
-
-class PatScan
-  def initialize(tmpdir, cpus)
-               @tmpdir       = tmpdir
-    @cpus         = cpus
-    @files_fasta  = Dir.glob(File.join(@tmpdir, "*.fna"))
-  end
-
-  def run(pattern)
-    file_pat = File.join(@tmpdir, "pat.txt")
-
-    File.open(file_pat, 'w') do |ios|
-      ios.puts pattern
-    end
-
-    child_count = 0
-    ch_mutex    = Mutex.new
-    threads     = []
-
-    @files_fasta.each do |file_fasta|
-      Thread.pass while child_count >= @cpus
-      ch_mutex.synchronize { child_count += 1 }
-
-      threads << Thread.new do
-        command = command_compile(file_pat, file_fasta)
-        system(command)
-        raise PatScanError, "Command failed: #{command}" unless $?.success?
-        ch_mutex.synchronize { child_count -= 1 }
-      end
-    end
-
-    threads.each { |t| t.join }
-  end
-
-  def parse_results
-    files_result = Dir.glob(File.join(@tmpdir, "*.out"))
-
-    matches = {}
-
-    files_result.each do |file|
-      Fasta.open(file, 'r') do |ios|
-        ios.each do |entry|
-          if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
-            name  = $1.to_i
-            start = $2.to_i - 1
-            stop  = $3.to_i - 1
-            matches[name] = [start, stop - start + 1] unless matches[name]
-          else
-            raise "Failed to parse sequence name: #{entry.seq_name}"
-          end
-        end
-      end
-    end
+require 'maasha/seq'
+require 'maasha/seq/backtrack'
 
-    matches
-  end
-
-  def clean
-    Dir.glob(File.join(@tmpdir, "*.out")).each do |file|
-      File.unlink(file)
-    end
-  end
-
-  private
-
-  def command_compile(file_pat, file_fasta)
-    commands = []
-    commands << "nice -n 19"
-    commands << "scan_for_matches"
-    commands << file_pat
-    commands << "< #{file_fasta}"
-    commands << "> #{file_fasta}.out"
-    command = commands.join(" ")
-  end
+def percent2real(length, percent)
+  (length * percent * 0.01).round
 end
 
-# Error class for Pattern errors.
-class PatternError < StandardError; end;
-
-class Pattern
-  def self.create_left(adaptor, misp, insp, delp, len)
-    patterns = []
-
-    (adaptor.length).downto(len) do |i|
-      pattern = adaptor[adaptor.length - i ... adaptor.length]
-      mis     = (pattern.length * misp * 0.01).round
-      ins     = (pattern.length * insp * 0.01).round
-      del     = (pattern.length * delp * 0.01).round
-
-      if i == adaptor.length
-        patterns << "#{pattern}[#{mis},#{ins},#{del}]"
-      else
-        patterns << "^ #{pattern}[#{mis},#{ins},#{del}]"
-      end
-    end
-
-    compile(patterns)
-  end
-
-  def self.create_right(adaptor, misp, insp, delp, len)
-    patterns = []
-
-    (adaptor.length).downto(len) do |i|
-      pattern = adaptor[0 ... i]
-      mis     = (pattern.length * misp * 0.01).round
-      ins     = (pattern.length * insp * 0.01).round
-      del     = (pattern.length * delp * 0.01).round
-
-      if i == adaptor.length
-        patterns << "#{pattern}[#{mis},#{ins},#{del}]"
-      else
-        patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
-      end
-    end
-
-    compile(patterns)
-  end
-
-  private
-
-  def self.compile(patterns)
-    new_patterns = []
-
-    while patterns.size > 1
-      new_patterns = patterns[0 ... -2] 
-      new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )"
-
-      patterns = new_patterns
-    end
-
-    patterns.first
-  end
-end
+include BackTrack
 
 casts = []
-casts << {:long=>'forward',       :short=>'f', :type=>'string', :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'forward_rc',    :short=>'F', :type=>'string', :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'reverse',       :short=>'r', :type=>'string', :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'reverse_rc',    :short=>'R', :type=>'string', :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'len_forward',   :short=>'l', :type=>'uint',   :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'len_reverse',   :short=>'L', :type=>'uint',   :mandatory=>false, :default=>nil,        :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'mismatches',    :short=>'m', :type=>'uint',   :mandatory=>false, :default=>10,         :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'insertions',    :short=>'i', :type=>'uint',   :mandatory=>false, :default=>5,          :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'deletions',     :short=>'d', :type=>'uint',   :mandatory=>false, :default=>5,          :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'cpus',          :short=>'c', :type=>'uint',   :mandatory=>false, :default=>1,          :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'bases_per_file',:short=>'b', :type=>'uint',   :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'forward',     :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward_rc',  :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse',     :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse_rc',  :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'len_forward', :short=>'l', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'len_reverse', :short=>'L', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches',  :short=>'m', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions',  :short=>'i', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions',   :short=>'d', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
 
 options = Biopieces.options_parse(ARGV, casts)
 
@@ -200,6 +68,10 @@ if options[:forward]
   if options[:len_forward] > options[:forward].length
     raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" 
   end
+
+  fmis = percent2real(options[:forward].length, options[:mismatches])
+  fins = percent2real(options[:forward].length, options[:insertions])
+  fdel = percent2real(options[:forward].length, options[:deletions])
 end
 
 if options[:reverse]
@@ -208,99 +80,81 @@ if options[:reverse]
   if options[:len_reverse] > options[:reverse].length
     raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
   end
-end
-
-tmpdir       = Biopieces.mktmpdir
-file_records = File.join(tmpdir, "data.stream")
-
-count_file = 0
-count_seq  = 0
-bases      = 0
 
-Biopieces.open(options[:stream_in], file_records) do |input, output|
-  file_fasta = File.join(tmpdir, "#{count_file}.fna")
-  out_fa     = Fasta.open(file_fasta, 'w')
+  rmis = percent2real(options[:reverse].length, options[:mismatches])
+  rins = percent2real(options[:reverse].length, options[:insertions])
+  rdel = percent2real(options[:reverse].length, options[:deletions])
+end
 
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
   input.each do |record|
-    output.puts record
-
     if record[:SEQ] and record[:SEQ].length > 0
-      record[:SEQ_NAME] = count_seq
-
-      seq = Seq.new_bp(record)
-
-      out_fa.puts seq.to_fasta
-
-      count_seq += 1;
-      bases      += record[:SEQ].length
-
-      if bases > options[:bases_per_file]
-        out_fa.close
-        bases       = 0
-        count_file += 1
-        file_fasta  = File.join(tmpdir, "#{count_file}.fna")
-        out_fa      = Fasta.open(file_fasta, 'w')
+      entry = Seq.new_bp(record)
+
+      if options[:forward]
+        if m = entry.patmatch(options[:forward], 0, fmis, fins, fdel)
+          record[:ADAPTOR_POS_LEFT] = m.pos
+          record[:ADAPTOR_LEN_LEFT] = m.length
+          record[:ADAPTOR_PAT_LEFT] = m.match
+        elsif options[:len_forward] < options[:forward].length
+          len = options[:forward].length - 1
+          pat = options[:forward]
+
+          while len >= options[:len_forward]
+            fmis = percent2real(len, options[:mismatches])
+            fins = percent2real(len, options[:insertions])
+            fdel = percent2real(len, options[:deletions])
+
+            pat = pat[1 ... pat.length]
+$stderr.puts pat
+$stderr.puts entry.seq[0 ... len]
+            if m = entry.patmatch(pat, [0, len], fmis, fins, fdel)
+              record[:ADAPTOR_POS_LEFT] = m.pos
+              record[:ADAPTOR_LEN_LEFT] = m.length
+              record[:ADAPTOR_PAT_LEFT] = m.match
+
+              break
+            end
+
+            len -= 1
+          end
+        end
       end
-    end
-  end
-
-  out_fa.close if out_fa.respond_to? :close
-end
 
-patscan       = PatScan.new(tmpdir, options[:cpus])
-matches_left  = {}
-matches_right = {}
+      if options[:reverse]
+        if m = entry.patmatch(options[:reverse], 0, rmis, rins, rdel)
+          record[:ADAPTOR_POS_RIGHT] = m.pos
+          record[:ADAPTOR_LEN_RIGHT] = m.length
+          record[:ADAPTOR_PAT_RIGHT] = m.match
+        elsif options[:len_reverse] < options[:reverse].length
+          len = options[:reverse].length - 1
+          pat = options[:reverse]
 
-if options[:forward]
-  pat_left = Pattern.create_left( 
-    options[:forward],
-    options[:mismatches],
-    options[:insertions],
-    options[:deletions],
-    options[:len_forward])
-
-  patscan.run(pat_left)
-  matches_left = patscan.parse_results
-end
+          while len >= options[:len_reverse]
+            rmis = percent2real(len, options[:mismatches])
+            rins = percent2real(len, options[:insertions])
+            rdel = percent2real(len, options[:deletions])
 
-if options[:reverse]
-  patscan.clean
-  pat_right = Pattern.create_right(
-    options[:reverse],
-    options[:mismatches],
-    options[:insertions],
-    options[:deletions],
-    options[:len_reverse])
-
-  patscan.run(pat_right)
-  matches_right = patscan.parse_results
-end
+            pat = pat[0 ... pat.length - 1]
 
-count_seq = 0
+            if m = entry.patmatch(pat, entry.length - len, rmis, rins, rdel)
+              record[:ADAPTOR_POS_RIGHT] = m.pos
+              record[:ADAPTOR_LEN_RIGHT] = m.length
+              record[:ADAPTOR_PAT_RIGHT] = m.match
 
-Biopieces.open(file_records, options[:stream_out]) do |input, output|
-  input.each_record do |record|
-    if record[:SEQ]
-      if matches_left[count_seq]
-        record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first
-        record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last
-        record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]]
-      end
+              break
+            end
 
-      if matches_right[count_seq]
-        record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first
-        record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last
-        record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]]
+            len -= 1
+          end
+        end
       end
-
-      count_seq += 1;
     end
 
     output.puts record
   end
 end
 
-
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<