]> git.donarmstrong.com Git - biopieces.git/commitdiff
revamped find_adaptor and clip_adaptor
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 3 Sep 2012 11:30:10 +0000 (11:30 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 3 Sep 2012 11:30:10 +0000 (11:30 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1909 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/clip_adaptor
bp_bin/find_adaptor

index 5cb2fa5ae083756c629849d08502c3ce9711386a..0589b27d0c109bf551d8ff4db07e42192ec3ee99 100755 (executable)
@@ -30,6 +30,7 @@
 
 
 require 'maasha/biopieces'
+require 'maasha/seq'
 
 casts = []
 
@@ -37,10 +38,18 @@ options = Biopieces.options_parse(ARGV, casts)
 
 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
   input.each_record do |record|
-    if record.has_key? :SEQ and record.has_key? :ADAPTOR_POS
-      record[:SEQ]     = record[:SEQ][0 ... record[:ADAPTOR_POS].to_i]
-      record[:SCORES]  = record[:SCORES][0 ... record[:ADAPTOR_POS].to_i] if record[:SCORES]
-      record[:SEQ_LEN] = record[:SEQ].length
+    if record[:SEQ] and (record[:ADAPTOR_POS_LEFT] or record[:ADAPTOR_POS_RIGHT])
+      entry = Seq.new_bp(record)
+
+      if record[:ADAPTOR_POS_RIGHT]
+        entry.subseq!(0, record[:ADAPTOR_POS_RIGHT].to_i)
+      end
+
+      if record[:ADAPTOR_POS_LEFT]
+        entry.subseq!(record[:ADAPTOR_POS_LEFT].to_i + record[:ADAPTOR_LEN_LEFT].to_i)
+      end
+
+      record.merge! entry.to_bp
     end
 
     output.puts record
index 7ba21a24fa6a921e411b1820f58488775e614bbd..9b6a3d215884bb96ccf36e38f37839f9ca6df2b9 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 
-# Copyright (C) 2007-2011 Martin A. Hansen.
+# Copyright (C) 2007-2012 Martin A. Hansen.
 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -37,28 +37,29 @@ require 'maasha/fasta'
 class PatScanError < StandardError; end;
 
 class PatScan
-  def initialize(options, tmpdir, file_pattern, cpus)
-    @options      = options
+  def initialize(tmpdir, cpus)
                @tmpdir       = tmpdir
-    @file_pattern = file_pattern
     @cpus         = cpus
     @files_fasta  = Dir.glob(File.join(@tmpdir, "*.fna"))
-
-    pat = Pattern.new(@options)
-    pat.write(@file_pattern)
   end
 
-  def run
+  def run(pattern)
+    file_pat = File.join(@tmpdir, "pat.txt")
+
+    File.open(file_pat, 'w') do |ios|
+      ios.puts pattern
+    end
+
     child_count = 0
-    ch_mutex = Mutex.new
-    threads = []
+    ch_mutex    = Mutex.new
+    threads     = []
 
-    @files_fasta.each do |file|
+    @files_fasta.each do |file_fasta|
       Thread.pass while child_count >= @cpus
       ch_mutex.synchronize { child_count += 1 }
 
       threads << Thread.new do
-        command = command_compile(file)
+        command = command_compile(file_pat, file_fasta)
         system(command)
         raise PatScanError, "Command failed: #{command}" unless $?.success?
         ch_mutex.synchronize { child_count -= 1 }
@@ -68,16 +69,6 @@ class PatScan
     threads.each { |t| t.join }
   end
 
-  def command_compile(file)
-    commands = []
-    commands << "nice -n 19"
-    commands << "scan_for_matches"
-    commands << @file_pattern
-    commands << "< #{file}"
-    commands << "> #{file}.out"
-    command = commands.join(" ")
-  end
-
   def parse_results
     files_result = Dir.glob(File.join(@tmpdir, "*.out"))
 
@@ -90,7 +81,7 @@ class PatScan
             name  = $1.to_i
             start = $2.to_i - 1
             stop  = $3.to_i - 1
-            matches[name] = [start, stop - start + 1] unless matches.has_key? name
+            matches[name] = [start, stop - start + 1] unless matches[name]
           else
             raise "Failed to parse sequence name: #{entry.seq_name}"
           end
@@ -100,123 +91,156 @@ class PatScan
 
     matches
   end
+
+  def clean
+    Dir.glob(File.join(@tmpdir, "*.out")).each do |file|
+      File.unlink(file)
+    end
+  end
+
+  private
+
+  def command_compile(file_pat, file_fasta)
+    commands = []
+    commands << "nice -n 19"
+    commands << "scan_for_matches"
+    commands << file_pat
+    commands << "< #{file_fasta}"
+    commands << "> #{file_fasta}.out"
+    command = commands.join(" ")
+  end
 end
 
 # Error class for Pattern errors.
 class PatternError < StandardError; end;
 
 class Pattern
-  def initialize(options)
-    @options  = options
-    @patterns = []
-    @patterns << pattern_internal
-    @patterns += patterns_end if @options[:partial]
-  end
-
-  def to_i
-    new_patterns = []
+  def self.create_left(adaptor, misp, insp, delp, len)
+    patterns = []
 
-    while @patterns.size > 1
-      new_patterns = @patterns[0 ... -2] 
-      new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )"
+    (adaptor.length).downto(len) do |i|
+      pattern = adaptor[adaptor.length - i ... adaptor.length]
+      mis     = (pattern.length * misp * 0.01).round
+      ins     = (pattern.length * insp * 0.01).round
+      del     = (pattern.length * delp * 0.01).round
 
-      @patterns = new_patterns
+      if i == adaptor.length
+        patterns << "#{pattern}[#{mis},#{ins},#{del}]"
+      else
+        patterns << "^ #{pattern}[#{mis},#{ins},#{del}]"
+      end
     end
 
-    @patterns.first
+    compile(patterns)
   end
 
-  def write(file)
-    File.open(file, 'w') do |ios|
-      ios.puts self.to_i
+  def self.create_right(adaptor, misp, insp, delp, len)
+    patterns = []
+
+    (adaptor.length).downto(len) do |i|
+      pattern = adaptor[0 ... i]
+      mis     = (pattern.length * misp * 0.01).round
+      ins     = (pattern.length * insp * 0.01).round
+      del     = (pattern.length * delp * 0.01).round
+
+      if i == adaptor.length
+        patterns << "#{pattern}[#{mis},#{ins},#{del}]"
+      else
+        patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
+      end
     end
+
+    compile(patterns)
   end
 
   private
 
-  def pattern_internal
-    pattern = @options[:adaptor]
-    mis = mis_count(pattern)
-    ins = ins_count(pattern)
-    del = del_count(pattern)
+  def self.compile(patterns)
+    new_patterns = []
+
+    while patterns.size > 1
+      new_patterns = patterns[0 ... -2] 
+      new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )"
+
+      patterns = new_patterns
+    end
 
-    "#{pattern}[#{mis},#{ins},#{del}]"
+    patterns.first
   end
+end
 
-  def patterns_end
-    patterns = []
-    adaptor  = @options[:adaptor]
+casts = []
+casts << {:long=>'forward',     :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward_rc',  :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse',     :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse_rc',  :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'len_forward', :short=>'l', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'len_reverse', :short=>'L', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches',  :short=>'m', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions',  :short=>'i', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions',   :short=>'d', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'cpus',        :short=>'c', :type=>'uint',   :mandatory=>false, :default=>1,   :allowed=>nil, :disallowed=>'0'}
 
-    raise PatternError, "len > adaptor length: #{@options[:len]} > #{adaptor.length - 1}" if @options[:len] > adaptor.length - 1
+BASE_PER_FILE = 10_000_000
 
-    (adaptor.length - 1).downto(@options[:len]) do |i|
-      pattern = adaptor[0 ... i]
-      mis = mis_count(pattern)
-      ins = ins_count(pattern)
-      del = del_count(pattern)
-      patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
-    end
+options = Biopieces.options_parse(ARGV, casts)
 
-    patterns
-  end
+if options[:forward_rc]
+  options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq
+end
 
-  def mis_count(pattern)
-    (pattern.length * @options[:mismatches] * 0.01).round
-  end
+if options[:reverse_rc]
+  options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq
+end
 
-  def ins_count(pattern)
-    (pattern.length * @options[:insertions] * 0.01).round
-  end
+raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse]
+
+if options[:forward]
+  options[:len_forward] = options[:forward].length unless options[:len_forward]
 
-  def del_count(pattern)
-    (pattern.length * @options[:deletions] * 0.01).round
+  if options[:len_forward] > options[:forward].length
+    raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" 
   end
 end
 
-casts = []
-casts << {:long=>'adaptor',    :short=>'a', :type=>'string', :mandatory=>true,  :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'partial',    :short=>'p', :type=>'flag',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'len',        :short=>'l', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'mismatches', :short=>'m', :type=>'uint',   :mandatory=>false, :default=>10,  :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'insertions', :short=>'i', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'deletions',  :short=>'d', :type=>'uint',   :mandatory=>false, :default=>5,   :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'cpus',       :short=>'c', :type=>'uint',   :mandatory=>false, :default=>1,   :allowed=>nil, :disallowed=>'0'}
-
-BASE_PER_FILE = 10_000_000
+if options[:reverse]
+  options[:len_reverse] = options[:reverse].length unless options[:len_reverse]
 
-options = Biopieces.options_parse(ARGV, casts)
+  if options[:len_reverse] > options[:reverse].length
+    raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
+  end
+end
 
 tmpdir       = Biopieces.mktmpdir
 file_records = File.join(tmpdir, "data.stream")
-file_pattern = File.join(tmpdir, "pattern.txt")
 
-number_file = 0
-number_seq  = 0
-bases       = 0
+count_file = 0
+count_seq  = 0
+bases      = 0
 
 Biopieces.open(options[:stream_in], file_records) do |input, output|
-  file_fasta = File.join(tmpdir, "#{number_file}.fna")
+  file_fasta = File.join(tmpdir, "#{count_file}.fna")
   out_fa     = Fasta.open(file_fasta, 'w')
 
   input.each do |record|
     output.puts record
 
     if record[:SEQ] and record[:SEQ].length > 0
-      record[:SEQ_NAME] = number_seq
+      record[:SEQ_NAME] = count_seq
 
       seq = Seq.new_bp(record)
 
       out_fa.puts seq.to_fasta
 
-      number_seq += 1;
+      count_seq += 1;
       bases      += record[:SEQ].length
 
       if bases > BASE_PER_FILE
         out_fa.close
-        bases = 0
-        number_file += 1
-        file_fasta = File.join(tmpdir, "#{number_file}.fna")
-        out_fa     = Fasta.open(file_fasta, 'w')
+        bases       = 0
+        count_file += 1
+        file_fasta  = File.join(tmpdir, "#{count_file}.fna")
+        out_fa      = Fasta.open(file_fasta, 'w')
       end
     end
   end
@@ -224,21 +248,53 @@ Biopieces.open(options[:stream_in], file_records) do |input, output|
   out_fa.close if out_fa.respond_to? :close
 end
 
-patscan = PatScan.new(options, tmpdir, file_pattern, options[:cpus])
-patscan.run
-matches = patscan.parse_results
+patscan       = PatScan.new(tmpdir, options[:cpus])
+matches_left  = {}
+matches_right = {}
 
-number_seq = 0
+if options[:forward]
+  pat_left = Pattern.create_left( 
+    options[:forward],
+    options[:mismatches],
+    options[:insertions],
+    options[:deletions],
+    options[:len_forward])
+
+  patscan.run(pat_left)
+  matches_left = patscan.parse_results
+end
+
+if options[:reverse]
+  patscan.clean
+  pat_right = Pattern.create_right(
+    options[:reverse],
+    options[:mismatches],
+    options[:insertions],
+    options[:deletions],
+    options[:len_reverse])
+
+  patscan.run(pat_right)
+  matches_right = patscan.parse_results
+end
+
+count_seq = 0
 
 Biopieces.open(file_records, options[:stream_out]) do |input, output|
   input.each_record do |record|
-    if record.has_key? :SEQ
-      if matches.has_key? number_seq
-        record[:ADAPTOR_POS] = matches[number_seq].first
-        record[:ADAPTOR_LEN] = matches[number_seq].last
+    if record[:SEQ]
+      if matches_left[count_seq]
+        record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first
+        record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last
+        record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]]
+      end
+
+      if matches_right[count_seq]
+        record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first
+        record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last
+        record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]]
       end
 
-      number_seq += 1;
+      count_seq += 1;
     end
 
     output.puts record