]> git.donarmstrong.com Git - biopieces.git/blobdiff - bp_bin/find_adaptor
upgraded write_fixedstep
[biopieces.git] / bp_bin / find_adaptor
index 53793095e7998b4c82d91b503025d756fd811611..290af5a19dfa5da0614f0ea911ec1c6c636bf63c 100755 (executable)
@@ -37,40 +37,63 @@ require 'maasha/fasta'
 class PatScanError < StandardError; end;
 
 class PatScan
-  def initialize(options, file_fasta, file_pattern, file_patscan)
+  def initialize(options, tmpdir, file_pattern, cpus)
     @options      = options
-    @file_fasta   = file_fasta
+               @tmpdir       = tmpdir
     @file_pattern = file_pattern
-    @file_patscan = file_patscan
+    @cpus         = cpus
+    @files_fasta  = Dir.glob(File.join(@tmpdir, "*.fna"))
 
     pat = Pattern.new(@options)
     pat.write(@file_pattern)
   end
 
   def run
+    child_count = 0
+    ch_mutex = Mutex.new
+    threads = []
+
+    @files_fasta.each do |file|
+      Thread.pass while child_count >= @cpus
+      ch_mutex.synchronize { child_count += 1 }
+
+      threads << Thread.new do
+        command = command_compile(file)
+        system(command)
+        raise PatScanError, "Command failed: #{command}" unless $?.success?
+        ch_mutex.synchronize { child_count -= 1 }
+      end
+    end
+
+    threads.each { |t| t.join }
+  end
+
+  def command_compile(file)
     commands = []
     commands << "nice -n 19"
     commands << "scan_for_matches"
     commands << @file_pattern
-    commands << "< #{@file_fasta}"
-    commands << "> #{@file_patscan}"
+    commands << "< #{file}"
+    commands << "> #{file}.out"
     command = commands.join(" ")
-    system(command)
-    raise PatScanError, "Command failed: #{command}" unless $?.success?
   end
 
   def parse_results
+    files_result = Dir.glob(File.join(@tmpdir, "*.out"))
+
     matches = {}
 
-    Fasta.open(@file_patscan, mode='r') do |ios|
-      ios.each do |entry|
-        if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
-          name  = $1.to_i
-          start = $2.to_i - 1
-          stop  = $3.to_i - 1
-          matches[name] = [start, stop - start + 1] unless matches.has_key? name
-        else
-          raise "Failed to parse sequence name: #{entry.seq_name}"
+    files_result.each do |file|
+      Fasta.open(file, 'r') do |ios|
+        ios.each do |entry|
+          if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
+            name  = $1.to_i
+            start = $2.to_i - 1
+            stop  = $3.to_i - 1
+            matches[name] = [start, stop - start + 1] unless matches.has_key? name
+          else
+            raise "Failed to parse sequence name: #{entry.seq_name}"
+          end
         end
       end
     end
@@ -104,7 +127,7 @@ class Pattern
   end
 
   def write(file)
-    File.open(file, mode='w') do |ios|
+    File.open(file, 'w') do |ios|
       ios.puts self.to_i
     end
   end
@@ -157,47 +180,65 @@ casts << {:long=>'len',        :short=>'l', :type=>'uint',   :mandatory=>false,
 casts << {:long=>'mismatches', :short=>'m', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>nil}
 casts << {:long=>'insertions', :short=>'i', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
 casts << {:long=>'deletions',  :short=>'d', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'cpus',       :short=>'c', :type=>'uint',   :mandatory=>false, :default=>1,   :allowed=>nil, :disallowed=>'0'}
+
+BASE_PER_FILE = 10_000_000
 
 options = Biopieces.options_parse(ARGV, casts)
 
 tmpdir       = Biopieces.mktmpdir
-file_fasta   = File.join(tmpdir, "data.fna")
 file_records = File.join(tmpdir, "data.stream")
 file_pattern = File.join(tmpdir, "pattern.txt")
-file_patscan = File.join(tmpdir, "patscan.fna")
 
-count = 0
+number_file = 0
+number_seq  = 0
+bases       = 0
 
 Biopieces.open(options[:stream_in], file_records) do |input, output|
-  Fasta.open(file_fasta, mode='w') do |out_fa|
-    input.each do |record|
-      output.puts record
+  file_fasta = File.join(tmpdir, "#{number_file}.fna")
+  out_fa     = Fasta.open(file_fasta, 'w')
 
-      if record.has_key? :SEQ
-        record[:SEQ_NAME] = count
-        out_fa.puts record
+  input.each do |record|
+    output.puts record
+
+    if record.has_key? :SEQ
+      record[:SEQ_NAME] = number_seq
+
+      seq = Seq.new_bp(record)
 
-        count += 1;
+      out_fa.puts seq.to_fasta
+
+      number_seq += 1;
+      bases      += record[:SEQ].length
+
+      if bases > BASE_PER_FILE
+        out_fa.close
+        bases = 0
+        number_file += 1
+        file_fasta = File.join(tmpdir, "#{number_file}.fna")
+        out_fa     = Fasta.open(file_fasta, 'w')
       end
     end
   end
+
+  out_fa.close if out_fa.respond_to? :close
 end
 
-patscan = PatScan.new(options, file_fasta, file_pattern, file_patscan)
+patscan = PatScan.new(options, tmpdir, file_pattern, options[:cpus])
 patscan.run
 matches = patscan.parse_results
 
-count = 0
+number_seq = 0
 
 Biopieces.open(file_records, options[:stream_out]) do |input, output|
   input.each_record do |record|
     if record.has_key? :SEQ
-      if matches.has_key? count
-        record[:ADAPTOR_POS] = matches[count].first
-        record[:ADAPTOR_LEN] = matches[count].last
+      if matches.has_key? number_seq
+        record[:ADAPTOR_POS] = matches[number_seq].first
+        record[:ADAPTOR_LEN] = matches[number_seq].last
       end
 
-      count += 1;
+      number_seq += 1;
     end
 
     output.puts record