#!/usr/bin/env ruby
-# Copyright (C) 2007-2011 Martin A. Hansen.
+# Copyright (C) 2007-2012 Martin A. Hansen.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
class PatScanError < StandardError; end;
class PatScan
- def initialize(options, tmpdir, file_pattern, cpus)
- @options = options
+ def initialize(tmpdir, cpus)
@tmpdir = tmpdir
- @file_pattern = file_pattern
@cpus = cpus
@files_fasta = Dir.glob(File.join(@tmpdir, "*.fna"))
-
- pat = Pattern.new(@options)
- pat.write(@file_pattern)
end
- def run
+ def run(pattern)
+ file_pat = File.join(@tmpdir, "pat.txt")
+
+ File.open(file_pat, 'w') do |ios|
+ ios.puts pattern
+ end
+
child_count = 0
- ch_mutex = Mutex.new
- threads = []
+ ch_mutex = Mutex.new
+ threads = []
- @files_fasta.each do |file|
+ @files_fasta.each do |file_fasta|
Thread.pass while child_count >= @cpus
ch_mutex.synchronize { child_count += 1 }
threads << Thread.new do
- command = command_compile(file)
+ command = command_compile(file_pat, file_fasta)
system(command)
raise PatScanError, "Command failed: #{command}" unless $?.success?
ch_mutex.synchronize { child_count -= 1 }
threads.each { |t| t.join }
end
- def command_compile(file)
- commands = []
- commands << "nice -n 19"
- commands << "scan_for_matches"
- commands << @file_pattern
- commands << "< #{file}"
- commands << "> #{file}.out"
- command = commands.join(" ")
- end
-
def parse_results
files_result = Dir.glob(File.join(@tmpdir, "*.out"))
name = $1.to_i
start = $2.to_i - 1
stop = $3.to_i - 1
- matches[name] = [start, stop - start + 1] unless matches.has_key? name
+ matches[name] = [start, stop - start + 1] unless matches[name]
else
raise "Failed to parse sequence name: #{entry.seq_name}"
end
matches
end
+
+ def clean
+ Dir.glob(File.join(@tmpdir, "*.out")).each do |file|
+ File.unlink(file)
+ end
+ end
+
+ private
+
+ def command_compile(file_pat, file_fasta)
+ commands = []
+ commands << "nice -n 19"
+ commands << "scan_for_matches"
+ commands << file_pat
+ commands << "< #{file_fasta}"
+ commands << "> #{file_fasta}.out"
+ command = commands.join(" ")
+ end
end
# Error class for Pattern errors.
class PatternError < StandardError; end;
class Pattern
- def initialize(options)
- @options = options
- @patterns = []
- @patterns << pattern_internal
- @patterns += patterns_end if @options[:partial]
- end
-
- def to_i
- new_patterns = []
+ def self.create_left(adaptor, misp, insp, delp, len)
+ patterns = []
- while @patterns.size > 1
- new_patterns = @patterns[0 ... -2]
- new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )"
+ (adaptor.length).downto(len) do |i|
+ pattern = adaptor[adaptor.length - i ... adaptor.length]
+ mis = (pattern.length * misp * 0.01).round
+ ins = (pattern.length * insp * 0.01).round
+ del = (pattern.length * delp * 0.01).round
- @patterns = new_patterns
+ if i == adaptor.length
+ patterns << "#{pattern}[#{mis},#{ins},#{del}]"
+ else
+ patterns << "^ #{pattern}[#{mis},#{ins},#{del}]"
+ end
end
- @patterns.first
+ compile(patterns)
end
- def write(file)
- File.open(file, 'w') do |ios|
- ios.puts self.to_i
+ def self.create_right(adaptor, misp, insp, delp, len)
+ patterns = []
+
+ (adaptor.length).downto(len) do |i|
+ pattern = adaptor[0 ... i]
+ mis = (pattern.length * misp * 0.01).round
+ ins = (pattern.length * insp * 0.01).round
+ del = (pattern.length * delp * 0.01).round
+
+ if i == adaptor.length
+ patterns << "#{pattern}[#{mis},#{ins},#{del}]"
+ else
+ patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
+ end
end
+
+ compile(patterns)
end
private
- def pattern_internal
- pattern = @options[:adaptor]
- mis = mis_count(pattern)
- ins = ins_count(pattern)
- del = del_count(pattern)
+ def self.compile(patterns)
+ new_patterns = []
+
+ while patterns.size > 1
+ new_patterns = patterns[0 ... -2]
+ new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )"
- "#{pattern}[#{mis},#{ins},#{del}]"
+ patterns = new_patterns
+ end
+
+ patterns.first
end
+end
- def patterns_end
- patterns = []
- adaptor = @options[:adaptor]
+casts = []
+casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'bases_per_file',:short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'}
- raise PatternError, "len > adaptor length: #{@options[:len]} > #{adaptor.length - 1}" if @options[:len] > adaptor.length - 1
+options = Biopieces.options_parse(ARGV, casts)
- (adaptor.length - 1).downto(@options[:len]) do |i|
- pattern = adaptor[0 ... i]
- mis = mis_count(pattern)
- ins = ins_count(pattern)
- del = del_count(pattern)
- patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
- end
+if options[:forward_rc]
+ options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq
+end
- patterns
- end
+if options[:reverse_rc]
+ options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq
+end
- def mis_count(pattern)
- (pattern.length * @options[:mismatches] * 0.01).round
- end
+raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse]
- def ins_count(pattern)
- (pattern.length * @options[:insertions] * 0.01).round
- end
+if options[:forward]
+ options[:len_forward] = options[:forward].length unless options[:len_forward]
- def del_count(pattern)
- (pattern.length * @options[:deletions] * 0.01).round
+ if options[:len_forward] > options[:forward].length
+ raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})"
end
end
-casts = []
-casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'partial', :short=>'p', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'len', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'}
-
-BASE_PER_FILE = 10_000_000
+if options[:reverse]
+ options[:len_reverse] = options[:reverse].length unless options[:len_reverse]
-options = Biopieces.options_parse(ARGV, casts)
+ if options[:len_reverse] > options[:reverse].length
+ raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
+ end
+end
tmpdir = Biopieces.mktmpdir
file_records = File.join(tmpdir, "data.stream")
-file_pattern = File.join(tmpdir, "pattern.txt")
-number_file = 0
-number_seq = 0
-bases = 0
+count_file = 0
+count_seq = 0
+bases = 0
Biopieces.open(options[:stream_in], file_records) do |input, output|
- file_fasta = File.join(tmpdir, "#{number_file}.fna")
+ file_fasta = File.join(tmpdir, "#{count_file}.fna")
out_fa = Fasta.open(file_fasta, 'w')
input.each do |record|
output.puts record
if record[:SEQ] and record[:SEQ].length > 0
- record[:SEQ_NAME] = number_seq
+ record[:SEQ_NAME] = count_seq
seq = Seq.new_bp(record)
out_fa.puts seq.to_fasta
- number_seq += 1;
+ count_seq += 1;
bases += record[:SEQ].length
- if bases > BASE_PER_FILE
+ if bases > options[:bases_per_file]
out_fa.close
- bases = 0
- number_file += 1
- file_fasta = File.join(tmpdir, "#{number_file}.fna")
- out_fa = Fasta.open(file_fasta, 'w')
+ bases = 0
+ count_file += 1
+ file_fasta = File.join(tmpdir, "#{count_file}.fna")
+ out_fa = Fasta.open(file_fasta, 'w')
end
end
end
out_fa.close if out_fa.respond_to? :close
end
-patscan = PatScan.new(options, tmpdir, file_pattern, options[:cpus])
-patscan.run
-matches = patscan.parse_results
+patscan = PatScan.new(tmpdir, options[:cpus])
+matches_left = {}
+matches_right = {}
-number_seq = 0
+if options[:forward]
+ pat_left = Pattern.create_left(
+ options[:forward],
+ options[:mismatches],
+ options[:insertions],
+ options[:deletions],
+ options[:len_forward])
+
+ patscan.run(pat_left)
+ matches_left = patscan.parse_results
+end
+
+if options[:reverse]
+ patscan.clean
+ pat_right = Pattern.create_right(
+ options[:reverse],
+ options[:mismatches],
+ options[:insertions],
+ options[:deletions],
+ options[:len_reverse])
+
+ patscan.run(pat_right)
+ matches_right = patscan.parse_results
+end
+
+count_seq = 0
Biopieces.open(file_records, options[:stream_out]) do |input, output|
input.each_record do |record|
- if record.has_key? :SEQ
- if matches.has_key? number_seq
- record[:ADAPTOR_POS] = matches[number_seq].first
- record[:ADAPTOR_LEN] = matches[number_seq].last
+ if record[:SEQ]
+ if matches_left[count_seq]
+ record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first
+ record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last
+ record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]]
+ end
+
+ if matches_right[count_seq]
+ record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first
+ record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last
+ record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]]
end
- number_seq += 1;
+ count_seq += 1;
end
output.puts record