From ba0a1611781f00a40515125cb45dae2c3de002d6 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Mon, 3 Sep 2012 11:30:10 +0000 Subject: [PATCH] revamped find_adaptor and clip_adaptor git-svn-id: http://biopieces.googlecode.com/svn/trunk@1909 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/clip_adaptor | 17 ++- bp_bin/find_adaptor | 252 +++++++++++++++++++++++++++----------------- 2 files changed, 167 insertions(+), 102 deletions(-) diff --git a/bp_bin/clip_adaptor b/bp_bin/clip_adaptor index 5cb2fa5..0589b27 100755 --- a/bp_bin/clip_adaptor +++ b/bp_bin/clip_adaptor @@ -30,6 +30,7 @@ require 'maasha/biopieces' +require 'maasha/seq' casts = [] @@ -37,10 +38,18 @@ options = Biopieces.options_parse(ARGV, casts) Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| - if record.has_key? :SEQ and record.has_key? :ADAPTOR_POS - record[:SEQ] = record[:SEQ][0 ... record[:ADAPTOR_POS].to_i] - record[:SCORES] = record[:SCORES][0 ... record[:ADAPTOR_POS].to_i] if record[:SCORES] - record[:SEQ_LEN] = record[:SEQ].length + if record[:SEQ] and (record[:ADAPTOR_POS_LEFT] or record[:ADAPTOR_POS_RIGHT]) + entry = Seq.new_bp(record) + + if record[:ADAPTOR_POS_RIGHT] + entry.subseq!(0, record[:ADAPTOR_POS_RIGHT].to_i) + end + + if record[:ADAPTOR_POS_LEFT] + entry.subseq!(record[:ADAPTOR_POS_LEFT].to_i + record[:ADAPTOR_LEN_LEFT].to_i) + end + + record.merge! entry.to_bp end output.puts record diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor index 7ba21a2..9b6a3d2 100755 --- a/bp_bin/find_adaptor +++ b/bp_bin/find_adaptor @@ -1,6 +1,6 @@ #!/usr/bin/env ruby -# Copyright (C) 2007-2011 Martin A. Hansen. +# Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -37,28 +37,29 @@ require 'maasha/fasta' class PatScanError < StandardError; end; class PatScan - def initialize(options, tmpdir, file_pattern, cpus) - @options = options + def initialize(tmpdir, cpus) @tmpdir = tmpdir - @file_pattern = file_pattern @cpus = cpus @files_fasta = Dir.glob(File.join(@tmpdir, "*.fna")) - - pat = Pattern.new(@options) - pat.write(@file_pattern) end - def run + def run(pattern) + file_pat = File.join(@tmpdir, "pat.txt") + + File.open(file_pat, 'w') do |ios| + ios.puts pattern + end + child_count = 0 - ch_mutex = Mutex.new - threads = [] + ch_mutex = Mutex.new + threads = [] - @files_fasta.each do |file| + @files_fasta.each do |file_fasta| Thread.pass while child_count >= @cpus ch_mutex.synchronize { child_count += 1 } threads << Thread.new do - command = command_compile(file) + command = command_compile(file_pat, file_fasta) system(command) raise PatScanError, "Command failed: #{command}" unless $?.success? ch_mutex.synchronize { child_count -= 1 } @@ -68,16 +69,6 @@ class PatScan threads.each { |t| t.join } end - def command_compile(file) - commands = [] - commands << "nice -n 19" - commands << "scan_for_matches" - commands << @file_pattern - commands << "< #{file}" - commands << "> #{file}.out" - command = commands.join(" ") - end - def parse_results files_result = Dir.glob(File.join(@tmpdir, "*.out")) @@ -90,7 +81,7 @@ class PatScan name = $1.to_i start = $2.to_i - 1 stop = $3.to_i - 1 - matches[name] = [start, stop - start + 1] unless matches.has_key? name + matches[name] = [start, stop - start + 1] unless matches[name] else raise "Failed to parse sequence name: #{entry.seq_name}" end @@ -100,123 +91,156 @@ class PatScan matches end + + def clean + Dir.glob(File.join(@tmpdir, "*.out")).each do |file| + File.unlink(file) + end + end + + private + + def command_compile(file_pat, file_fasta) + commands = [] + commands << "nice -n 19" + commands << "scan_for_matches" + commands << file_pat + commands << "< #{file_fasta}" + commands << "> #{file_fasta}.out" + command = commands.join(" ") + end end # Error class for Pattern errors. class PatternError < StandardError; end; class Pattern - def initialize(options) - @options = options - @patterns = [] - @patterns << pattern_internal - @patterns += patterns_end if @options[:partial] - end - - def to_i - new_patterns = [] + def self.create_left(adaptor, misp, insp, delp, len) + patterns = [] - while @patterns.size > 1 - new_patterns = @patterns[0 ... -2] - new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )" + (adaptor.length).downto(len) do |i| + pattern = adaptor[adaptor.length - i ... adaptor.length] + mis = (pattern.length * misp * 0.01).round + ins = (pattern.length * insp * 0.01).round + del = (pattern.length * delp * 0.01).round - @patterns = new_patterns + if i == adaptor.length + patterns << "#{pattern}[#{mis},#{ins},#{del}]" + else + patterns << "^ #{pattern}[#{mis},#{ins},#{del}]" + end end - @patterns.first + compile(patterns) end - def write(file) - File.open(file, 'w') do |ios| - ios.puts self.to_i + def self.create_right(adaptor, misp, insp, delp, len) + patterns = [] + + (adaptor.length).downto(len) do |i| + pattern = adaptor[0 ... i] + mis = (pattern.length * misp * 0.01).round + ins = (pattern.length * insp * 0.01).round + del = (pattern.length * delp * 0.01).round + + if i == adaptor.length + patterns << "#{pattern}[#{mis},#{ins},#{del}]" + else + patterns << "#{pattern}[#{mis},#{ins},#{del}] $" + end end + + compile(patterns) end private - def pattern_internal - pattern = @options[:adaptor] - mis = mis_count(pattern) - ins = ins_count(pattern) - del = del_count(pattern) + def self.compile(patterns) + new_patterns = [] + + while patterns.size > 1 + new_patterns = patterns[0 ... -2] + new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )" + + patterns = new_patterns + end - "#{pattern}[#{mis},#{ins},#{del}]" + patterns.first end +end - def patterns_end - patterns = [] - adaptor = @options[:adaptor] +casts = [] +casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'} - raise PatternError, "len > adaptor length: #{@options[:len]} > #{adaptor.length - 1}" if @options[:len] > adaptor.length - 1 +BASE_PER_FILE = 10_000_000 - (adaptor.length - 1).downto(@options[:len]) do |i| - pattern = adaptor[0 ... i] - mis = mis_count(pattern) - ins = ins_count(pattern) - del = del_count(pattern) - patterns << "#{pattern}[#{mis},#{ins},#{del}] $" - end +options = Biopieces.options_parse(ARGV, casts) - patterns - end +if options[:forward_rc] + options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq +end - def mis_count(pattern) - (pattern.length * @options[:mismatches] * 0.01).round - end +if options[:reverse_rc] + options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq +end - def ins_count(pattern) - (pattern.length * @options[:insertions] * 0.01).round - end +raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse] + +if options[:forward] + options[:len_forward] = options[:forward].length unless options[:len_forward] - def del_count(pattern) - (pattern.length * @options[:deletions] * 0.01).round + if options[:len_forward] > options[:forward].length + raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" end end -casts = [] -casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'partial', :short=>'p', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'len', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'} - -BASE_PER_FILE = 10_000_000 +if options[:reverse] + options[:len_reverse] = options[:reverse].length unless options[:len_reverse] -options = Biopieces.options_parse(ARGV, casts) + if options[:len_reverse] > options[:reverse].length + raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})" + end +end tmpdir = Biopieces.mktmpdir file_records = File.join(tmpdir, "data.stream") -file_pattern = File.join(tmpdir, "pattern.txt") -number_file = 0 -number_seq = 0 -bases = 0 +count_file = 0 +count_seq = 0 +bases = 0 Biopieces.open(options[:stream_in], file_records) do |input, output| - file_fasta = File.join(tmpdir, "#{number_file}.fna") + file_fasta = File.join(tmpdir, "#{count_file}.fna") out_fa = Fasta.open(file_fasta, 'w') input.each do |record| output.puts record if record[:SEQ] and record[:SEQ].length > 0 - record[:SEQ_NAME] = number_seq + record[:SEQ_NAME] = count_seq seq = Seq.new_bp(record) out_fa.puts seq.to_fasta - number_seq += 1; + count_seq += 1; bases += record[:SEQ].length if bases > BASE_PER_FILE out_fa.close - bases = 0 - number_file += 1 - file_fasta = File.join(tmpdir, "#{number_file}.fna") - out_fa = Fasta.open(file_fasta, 'w') + bases = 0 + count_file += 1 + file_fasta = File.join(tmpdir, "#{count_file}.fna") + out_fa = Fasta.open(file_fasta, 'w') end end end @@ -224,21 +248,53 @@ Biopieces.open(options[:stream_in], file_records) do |input, output| out_fa.close if out_fa.respond_to? :close end -patscan = PatScan.new(options, tmpdir, file_pattern, options[:cpus]) -patscan.run -matches = patscan.parse_results +patscan = PatScan.new(tmpdir, options[:cpus]) +matches_left = {} +matches_right = {} -number_seq = 0 +if options[:forward] + pat_left = Pattern.create_left( + options[:forward], + options[:mismatches], + options[:insertions], + options[:deletions], + options[:len_forward]) + + patscan.run(pat_left) + matches_left = patscan.parse_results +end + +if options[:reverse] + patscan.clean + pat_right = Pattern.create_right( + options[:reverse], + options[:mismatches], + options[:insertions], + options[:deletions], + options[:len_reverse]) + + patscan.run(pat_right) + matches_right = patscan.parse_results +end + +count_seq = 0 Biopieces.open(file_records, options[:stream_out]) do |input, output| input.each_record do |record| - if record.has_key? :SEQ - if matches.has_key? number_seq - record[:ADAPTOR_POS] = matches[number_seq].first - record[:ADAPTOR_LEN] = matches[number_seq].last + if record[:SEQ] + if matches_left[count_seq] + record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first + record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last + record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]] + end + + if matches_right[count_seq] + record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first + record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last + record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]] end - number_seq += 1; + count_seq += 1; end output.puts record -- 2.39.5