#!/usr/bin/env ruby
-# Copyright (C) 2007-2011 Martin A. Hansen.
+# Copyright (C) 2007-2012 Martin A. Hansen.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+require 'pp'
require 'maasha/biopieces'
-require 'maasha/seq'
-
-def disambiguate(adaptor)
- adaptor_disamb = adaptor.dup
- adaptor_disamb.gsub!('U', 'T')
- adaptor_disamb.gsub!('R', '[AG]')
- adaptor_disamb.gsub!('Y', '[CT]')
- adaptor_disamb.gsub!('S', '[GC]')
- adaptor_disamb.gsub!('W', '[AT]')
- adaptor_disamb.gsub!('M', '[AC]')
- adaptor_disamb.gsub!('K', '[GT]')
- adaptor_disamb.gsub!('V', '[ACG]')
- adaptor_disamb.gsub!('H', '[ACT]')
- adaptor_disamb.gsub!('D', '[AGT]')
- adaptor_disamb.gsub!('B', '[CGT]')
- adaptor_disamb.gsub!('N', '.')
- adaptor_disamb
-end
+require 'maasha/fasta'
+
+# Error class for PatScan errors.
+class PatScanError < StandardError; end;
+
+class PatScan
+ def initialize(tmpdir, cpus)
+ @tmpdir = tmpdir
+ @cpus = cpus
+ @files_fasta = Dir.glob(File.join(@tmpdir, "*.fna"))
+ end
-class Seq
- # Method that finds an adaptor or part thereof in the sequence of a Seq object.
- # Returns a Match object if the adaptor was found otherwise nil. The ed_percent
- # indicates the maximum edit distance allowed in all possible overlaps.
- def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0, dist = 0)
- raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent
+ def run(pattern)
+ file_pat = File.join(@tmpdir, "pat.txt")
- if pos < 0
- pos = self.length + pos # pos offset from the right end
+ File.open(file_pat, 'w') do |ios|
+ ios.puts pattern
end
- if pos < self.length - dist + 1
- if match = adaptor_find_simple(adaptor_disamb, pos)
- return match
- elsif match = adaptor_find_complex(adaptor, pos, ed_percent)
- return match
- elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent, dist)
- return match
+ child_count = 0
+ ch_mutex = Mutex.new
+ threads = []
+
+ @files_fasta.each do |file_fasta|
+ Thread.pass while child_count >= @cpus
+ ch_mutex.synchronize { child_count += 1 }
+
+ threads << Thread.new do
+ command = command_compile(file_pat, file_fasta)
+ system(command)
+ raise PatScanError, "Command failed: #{command}" unless $?.success?
+ ch_mutex.synchronize { child_count -= 1 }
end
end
- end
- private
+ threads.each { |t| t.join }
+ end
- # Method to find an adaptor in a sequence taking into account ambiguity
- # codes, but not considering mismatches, insertions, and deletions.
- def adaptor_find_simple(adaptor, pos)
- self.seq.upcase.match(adaptor, pos) do |m|
- return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
+ def parse_results
+ files_result = Dir.glob(File.join(@tmpdir, "*.out"))
+
+ matches = {}
+
+ files_result.each do |file|
+ Fasta.open(file, 'r') do |ios|
+ ios.each do |entry|
+ if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/
+ name = $1.to_i
+ start = $2.to_i - 1
+ stop = $3.to_i - 1
+ matches[name] = [start, stop - start + 1] unless matches[name]
+ else
+ raise "Failed to parse sequence name: #{entry.seq_name}"
+ end
+ end
+ end
end
+
+ matches
end
- # Method to find an adaptor in a sequence taking into account ambiguity
- # codes, mismatches, insertions, and deletions.
- def adaptor_find_complex(adaptor, pos, ed_percent)
- ed_max = (adaptor.length * ed_percent * 0.01).round
+ def clean
+ Dir.glob(File.join(@tmpdir, "*.out")).each do |file|
+ File.unlink(file)
+ end
+ end
- match = self.match(adaptor, pos, ed_max)
+ private
- match
+ def command_compile(file_pat, file_fasta)
+ commands = []
+ commands << "nice -n 19"
+ commands << "scan_for_matches"
+ commands << file_pat
+ commands << "< #{file_fasta}"
+ commands << "> #{file_fasta}.out"
+ command = commands.join(" ")
end
+end
- # Method to find part of an adaptor at the right end of a sequence taking
- # into account ambiguity codes, mismatches, insertions, and deletions.
- def adaptor_partial_find_complex(adaptor, pos, ed_percent, dist)
- if pos > self.length - adaptor.length
- adaptor = adaptor[0 ... self.length - pos]
- else
- adaptor = adaptor[0 ... adaptor.length]
+# Error class for Pattern errors.
+class PatternError < StandardError; end;
- pos = self.length - adaptor.length
+class Pattern
+ def self.create_left(adaptor, misp, insp, delp, len)
+ patterns = []
+
+ (adaptor.length).downto(len) do |i|
+ pattern = adaptor[adaptor.length - i ... adaptor.length]
+ mis = (pattern.length * misp * 0.01).round
+ ins = (pattern.length * insp * 0.01).round
+ del = (pattern.length * delp * 0.01).round
+
+ if i == adaptor.length
+ patterns << "#{pattern}[#{mis},#{ins},#{del}]"
+ else
+ patterns << "^ #{pattern}[#{mis},#{ins},#{del}]"
+ end
end
- # puts self.seq
+ compile(patterns)
+ end
- while adaptor.length > 0 and adaptor.length >= dist
- # puts (" " * pos) + adaptor
+ def self.create_right(adaptor, misp, insp, delp, len)
+ patterns = []
- ed_max = (adaptor.length * ed_percent * 0.01).round
+ (adaptor.length).downto(len) do |i|
+ pattern = adaptor[0 ... i]
+ mis = (pattern.length * misp * 0.01).round
+ ins = (pattern.length * insp * 0.01).round
+ del = (pattern.length * delp * 0.01).round
- if ed_max == 0
- self.seq.upcase.match(adaptor, pos) do |m|
- return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length)
- end
+ if i == adaptor.length
+ patterns << "#{pattern}[#{mis},#{ins},#{del}]"
else
- self.scan(adaptor, pos, ed_max).each do |match|
- return match
- end
+ patterns << "#{pattern}[#{mis},#{ins},#{del}] $"
end
+ end
- adaptor = adaptor[0 ... -1]
+ compile(patterns)
+ end
+
+ private
+
+ def self.compile(patterns)
+ new_patterns = []
- pos += 1
+ while patterns.size > 1
+ new_patterns = patterns[0 ... -2]
+ new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )"
+
+ patterns = new_patterns
end
+
+ patterns.first
end
end
casts = []
-casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'edit_distance', :short=>'e', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'pos', :short=>'p', :type=>'int', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"}
-casts << {:long=>'dist', :short=>'d', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'cache', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'}
+casts << {:long=>'bases_per_file',:short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'}
options = Biopieces.options_parse(ARGV, casts)
-adaptor = options[:adaptor].to_s.upcase
-adaptor_disamb = disambiguate(adaptor)
+if options[:forward_rc]
+ options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq
+end
+
+if options[:reverse_rc]
+ options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq
+end
-pos = options[:pos]
-pos -= 1 if pos > 0 # pos was 1-based
+raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse]
-cache = {}
+if options[:forward]
+ options[:len_forward] = options[:forward].length unless options[:len_forward]
-Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
- input.each_record do |record|
- if record.has_key? :SEQ
- entry = Seq.new(record[:SEQ_NAME], record[:SEQ], "dna", record[:SCORES])
+ if options[:len_forward] > options[:forward].length
+ raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})"
+ end
+end
- if cache[entry.seq.upcase.to_sym] and options[:cache]
- match = cache[entry.seq.upcase]
- else
- match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance], options[:dist])
+if options[:reverse]
+ options[:len_reverse] = options[:reverse].length unless options[:len_reverse]
+
+ if options[:len_reverse] > options[:reverse].length
+ raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})"
+ end
+end
+
+tmpdir = Biopieces.mktmpdir
+file_records = File.join(tmpdir, "data.stream")
+
+count_file = 0
+count_seq = 0
+bases = 0
+
+Biopieces.open(options[:stream_in], file_records) do |input, output|
+ file_fasta = File.join(tmpdir, "#{count_file}.fna")
+ out_fa = Fasta.open(file_fasta, 'w')
+
+ input.each do |record|
+ output.puts record
+
+ if record[:SEQ] and record[:SEQ].length > 0
+ record[:SEQ_NAME] = count_seq
+
+ seq = Seq.new_bp(record)
+
+ out_fa.puts seq.to_fasta
- cache[entry.seq.upcase.to_sym] = match if match and options[:cache]
+ count_seq += 1;
+ bases += record[:SEQ].length
+
+ if bases > options[:bases_per_file]
+ out_fa.close
+ bases = 0
+ count_file += 1
+ file_fasta = File.join(tmpdir, "#{count_file}.fna")
+ out_fa = Fasta.open(file_fasta, 'w')
+ end
+ end
+ end
+
+ out_fa.close if out_fa.respond_to? :close
+end
+
+patscan = PatScan.new(tmpdir, options[:cpus])
+matches_left = {}
+matches_right = {}
+
+if options[:forward]
+ pat_left = Pattern.create_left(
+ options[:forward],
+ options[:mismatches],
+ options[:insertions],
+ options[:deletions],
+ options[:len_forward])
+
+ patscan.run(pat_left)
+ matches_left = patscan.parse_results
+end
+
+if options[:reverse]
+ patscan.clean
+ pat_right = Pattern.create_right(
+ options[:reverse],
+ options[:mismatches],
+ options[:insertions],
+ options[:deletions],
+ options[:len_reverse])
+
+ patscan.run(pat_right)
+ matches_right = patscan.parse_results
+end
+
+count_seq = 0
+
+Biopieces.open(file_records, options[:stream_out]) do |input, output|
+ input.each_record do |record|
+ if record[:SEQ]
+ if matches_left[count_seq]
+ record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first
+ record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last
+ record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]]
end
- if match
- record[:ADAPTOR_POS] = match.pos
- record[:ADAPTOR_LEN] = match.length
- record[:ADAPTOR_MATCH] = match.match
+ if matches_right[count_seq]
+ record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first
+ record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last
+ record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]]
end
+
+ count_seq += 1;
end
output.puts record