# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
require 'pp'
require 'maasha/biopieces'
-require 'maasha/fasta'
-
-# Error class for PatScan errors.
-class PatScanError < StandardError; end;
-
-class PatScan
- def initialize(tmpdir, cpus)
- @tmpdir = tmpdir
- @cpus = cpus
- @files_fasta = Dir.glob(File.join(@tmpdir, "*.fna"))
- end
-
- def run(pattern)
- file_pat = File.join(@tmpdir, "pat.txt")
-
- File.open(file_pat, 'w') do |ios|
- ios.puts pattern
- end
-
- child_count = 0
- ch_mutex = Mutex.new
- threads = []
-
- @files_fasta.each do |file_fasta|
- Thread.pass while child_count >= @cpus
- ch_mutex.synchronize { child_count += 1 }
-
- threads << Thread.new do
- command = command_compile(file_pat, file_fasta)
- system(command)
- raise PatScanError, "Command failed: #{command}" unless $?.success?
- ch_mutex.synchronize { child_count -= 1 }
- end
- end
-
- threads.each { |t| t.join }
- end
-
- def parse_results
- files_result = Dir.glob(File.join(@tmpdir, "*.out"))
-
- matches = {}
-
- files_result.each do |file|
- Fasta.open(file, 'r') do |ios|
- ios.each do |entry|
- if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
- name = $1.to_i
- start = $2.to_i - 1
- stop = $3.to_i - 1
- matches[name] = [start, stop - start + 1] unless matches[name]
- else
- raise "Failed to parse sequence name: #{entry.seq_name}"
- end
- end
- end
- end
-
- matches
- end
+require 'maasha/seq'
+require 'maasha/seq/backtrack'
- def clean
- Dir.glob(File.join(@tmpdir, "*.out")).each do |file|
- File.unlink(file)
- end
- end
-
- private
-
- def command_compile(file_pat, file_fasta)
- commands = []
- commands << "nice -n 19"
- commands << "scan_for_matches"
- commands << file_pat
- commands << "< #{file_fasta}"
- commands << "> #{file_fasta}.out"
- command = commands.join(" ")
- end
+def percent2real(length, percent)
+ (length * percent * 0.01).round
end
-# Error class for Pattern errors.
-class PatternError < StandardError; end;
-
-class Pattern
- def self.create_left(adaptor, misp, insp, delp, len)
- patterns = []
-
- (adaptor.length).downto(len) do |i|
- pattern = adaptor[adaptor.length - i ... adaptor.length]
- mis = (pattern.length * misp * 0.01).round
- ins = (pattern.length * insp * 0.01).round
- del = (pattern.length * delp * 0.01).round
-
- if i == adaptor.length
- patterns << "#{pattern}[#{mis},#{ins},#{del}]"
- else
- patterns << "^ #{pattern}[#{mis},#{ins},#{del}]"
- end
- end
-
- compile(patterns)
- end
-
- def self.create_right(adaptor, misp, insp, delp, len)
- patterns = []
-
- (adaptor.length).downto(len) do |i|
- pattern = adaptor[0 ... i]
- mis = (pattern.length * misp * 0.01).round
- ins = (pattern.length * insp * 0.01).round
- del = (pattern.length * delp * 0.01).round
-
- if i == adaptor.length
- patterns << "#{pattern}[#{mis},#{ins},#{del}]"
- else
- patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
- end
- end
-
- compile(patterns)
- end
-
- private
-
- def self.compile(patterns)
- new_patterns = []
-
- while patterns.size > 1
- new_patterns = patterns[0 ... -2]
- new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )"
-
- patterns = new_patterns
- end
-
- patterns.first
- end
-end
+include BackTrack
casts = []
-casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'}
-casts << {:long=>'bases_per_file',:short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
options = Biopieces.options_parse(ARGV, casts)
if options[:forward_rc]
- options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq
+ options[:forward] = Seq.new(seq: options[:forward_rc], type: :dna).reverse.complement.seq
end
if options[:reverse_rc]
- options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq
+ options[:reverse] = Seq.new(seq: options[:reverse_rc], type: :dna).reverse.complement.seq
end
raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse]
if options[:len_forward] > options[:forward].length
raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})"
end
+
+ fmis = percent2real(options[:forward].length, options[:mismatches])
+ fins = percent2real(options[:forward].length, options[:insertions])
+ fdel = percent2real(options[:forward].length, options[:deletions])
end
if options[:reverse]
if options[:len_reverse] > options[:reverse].length
raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
end
-end
-tmpdir = Biopieces.mktmpdir
-file_records = File.join(tmpdir, "data.stream")
-
-count_file = 0
-count_seq = 0
-bases = 0
-
-Biopieces.open(options[:stream_in], file_records) do |input, output|
- file_fasta = File.join(tmpdir, "#{count_file}.fna")
- out_fa = Fasta.open(file_fasta, 'w')
+ rmis = percent2real(options[:reverse].length, options[:mismatches])
+ rins = percent2real(options[:reverse].length, options[:insertions])
+ rdel = percent2real(options[:reverse].length, options[:deletions])
+end
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
input.each do |record|
- output.puts record
+ if record[:SEQ]
+ entry = Seq.new_bp(record)
- if record[:SEQ] and record[:SEQ].length > 0
- record[:SEQ_NAME] = count_seq
+ if options[:forward] and record[:SEQ].length >= options[:forward].length
+ if m = entry.patmatch(options[:forward], max_mismatches: fmis, max_insertions: fins, max_deletions: fdel)
+ record[:ADAPTOR_POS_LEFT] = m.pos
+ record[:ADAPTOR_LEN_LEFT] = m.length
+ record[:ADAPTOR_PAT_LEFT] = m.match
+ elsif options[:len_forward] < options[:forward].length
+ len = options[:forward].length - 1
+ pat = options[:forward]
- seq = Seq.new_bp(record)
+ while len >= options[:len_forward]
+ fmis = percent2real(len, options[:mismatches])
+ fins = percent2real(len, options[:insertions])
+ fdel = percent2real(len, options[:deletions])
- out_fa.puts seq.to_fasta
+ pat = pat[1 ... pat.length]
- count_seq += 1;
- bases += record[:SEQ].length
+ if m = entry.patmatch(pat, start: 0, stop: len, max_mismatches: fmis, max_insertions: fins, max_deletions: fdel)
+ record[:ADAPTOR_POS_LEFT] = m.pos
+ record[:ADAPTOR_LEN_LEFT] = m.length
+ record[:ADAPTOR_PAT_LEFT] = m.match
- if bases > options[:bases_per_file]
- out_fa.close
- bases = 0
- count_file += 1
- file_fasta = File.join(tmpdir, "#{count_file}.fna")
- out_fa = Fasta.open(file_fasta, 'w')
- end
- end
- end
+ break
+ end
- out_fa.close if out_fa.respond_to? :close
-end
+ len -= 1
+ end
+ end
+ end
-patscan = PatScan.new(tmpdir, options[:cpus])
-matches_left = {}
-matches_right = {}
+ if options[:reverse] and record[:SEQ].length >= options[:reverse].length
+ if m = entry.patmatch(options[:reverse], max_mismatches: rmis, max_insertions: rins, max_deletions: rdel)
+ record[:ADAPTOR_POS_RIGHT] = m.pos
+ record[:ADAPTOR_LEN_RIGHT] = m.length
+ record[:ADAPTOR_PAT_RIGHT] = m.match
+ elsif options[:len_reverse] < options[:reverse].length
+ len = options[:reverse].length - 1
+ pat = options[:reverse]
-if options[:forward]
- pat_left = Pattern.create_left(
- options[:forward],
- options[:mismatches],
- options[:insertions],
- options[:deletions],
- options[:len_forward])
-
- patscan.run(pat_left)
- matches_left = patscan.parse_results
-end
+ while len >= options[:len_reverse]
+ rmis = percent2real(len, options[:mismatches])
+ rins = percent2real(len, options[:insertions])
+ rdel = percent2real(len, options[:deletions])
-if options[:reverse]
- patscan.clean
- pat_right = Pattern.create_right(
- options[:reverse],
- options[:mismatches],
- options[:insertions],
- options[:deletions],
- options[:len_reverse])
-
- patscan.run(pat_right)
- matches_right = patscan.parse_results
-end
+ pat = pat[0 ... pat.length - 1]
-count_seq = 0
+ if m = entry.patmatch(pat, start: entry.length - len, max_mismatches: rmis, max_insertions: rins, max_deletions: rdel)
+ record[:ADAPTOR_POS_RIGHT] = m.pos
+ record[:ADAPTOR_LEN_RIGHT] = m.length
+ record[:ADAPTOR_PAT_RIGHT] = m.match
-Biopieces.open(file_records, options[:stream_out]) do |input, output|
- input.each_record do |record|
- if record[:SEQ]
- if matches_left[count_seq]
- record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first
- record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last
- record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]]
- end
+ break
+ end
- if matches_right[count_seq]
- record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first
- record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last
- record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]]
+ len -= 1
+ end
+ end
end
-
- count_seq += 1;
end
output.puts record
end
end
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<