#!/usr/bin/env ruby # Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # http://www.gnu.org/copyleft/gpl.html # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # This program is part of the Biopieces framework (www.biopieces.org). # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Remove adaptors or parts thereof from sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'pp' require 'maasha/biopieces' require 'maasha/fasta' # Error class for PatScan errors. class PatScanError < StandardError; end; class PatScan def initialize(tmpdir, cpus) @tmpdir = tmpdir @cpus = cpus @files_fasta = Dir.glob(File.join(@tmpdir, "*.fna")) end def run(pattern) file_pat = File.join(@tmpdir, "pat.txt") File.open(file_pat, 'w') do |ios| ios.puts pattern end child_count = 0 ch_mutex = Mutex.new threads = [] @files_fasta.each do |file_fasta| Thread.pass while child_count >= @cpus ch_mutex.synchronize { child_count += 1 } threads << Thread.new do command = command_compile(file_pat, file_fasta) system(command) raise PatScanError, "Command failed: #{command}" unless $?.success? ch_mutex.synchronize { child_count -= 1 } end end threads.each { |t| t.join } end def parse_results files_result = Dir.glob(File.join(@tmpdir, "*.out")) matches = {} files_result.each do |file| Fasta.open(file, 'r') do |ios| ios.each do |entry| if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ name = $1.to_i start = $2.to_i - 1 stop = $3.to_i - 1 matches[name] = [start, stop - start + 1] unless matches[name] else raise "Failed to parse sequence name: #{entry.seq_name}" end end end end matches end def clean Dir.glob(File.join(@tmpdir, "*.out")).each do |file| File.unlink(file) end end private def command_compile(file_pat, file_fasta) commands = [] commands << "nice -n 19" commands << "scan_for_matches" commands << file_pat commands << "< #{file_fasta}" commands << "> #{file_fasta}.out" command = commands.join(" ") end end # Error class for Pattern errors. class PatternError < StandardError; end; class Pattern def self.create_left(adaptor, misp, insp, delp, len) patterns = [] (adaptor.length).downto(len) do |i| pattern = adaptor[adaptor.length - i ... adaptor.length] mis = (pattern.length * misp * 0.01).round ins = (pattern.length * insp * 0.01).round del = (pattern.length * delp * 0.01).round if i == adaptor.length patterns << "#{pattern}[#{mis},#{ins},#{del}]" else patterns << "^ #{pattern}[#{mis},#{ins},#{del}]" end end compile(patterns) end def self.create_right(adaptor, misp, insp, delp, len) patterns = [] (adaptor.length).downto(len) do |i| pattern = adaptor[0 ... i] mis = (pattern.length * misp * 0.01).round ins = (pattern.length * insp * 0.01).round del = (pattern.length * delp * 0.01).round if i == adaptor.length patterns << "#{pattern}[#{mis},#{ins},#{del}]" else patterns << "#{pattern}[#{mis},#{ins},#{del}] $" end end compile(patterns) end private def self.compile(patterns) new_patterns = [] while patterns.size > 1 new_patterns = patterns[0 ... -2] new_patterns << "( #{patterns[-2 .. -1].join(' | ')} )" patterns = new_patterns end patterns.first end end casts = [] casts << {:long=>'forward', :short=>'f', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'forward_rc', :short=>'F', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'reverse', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'reverse_rc', :short=>'R', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'len_forward', :short=>'l', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} casts << {:long=>'len_reverse', :short=>'L', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'} casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'} casts << {:long=>'bases_per_file',:short=>'b', :type=>'uint', :mandatory=>false, :default=>10_000_000, :allowed=>nil, :disallowed=>'0'} options = Biopieces.options_parse(ARGV, casts) if options[:forward_rc] options[:forward] = Seq.new("test", options[:forward_rc], 'dna').revcomp.seq end if options[:reverse_rc] options[:reverse] = Seq.new("test", options[:reverse_rc], 'dna').revcomp.seq end raise ArgumentError, "no adaptor specified" unless options[:forward] or options[:reverse] if options[:forward] options[:len_forward] = options[:forward].length unless options[:len_forward] if options[:len_forward] > options[:forward].length raise ArgumentError, "len_forward > forward adaptor (#{options[:len_forward]} > #{options[:forward].length})" end end if options[:reverse] options[:len_reverse] = options[:reverse].length unless options[:len_reverse] if options[:len_reverse] > options[:reverse].length raise ArgumentError, "len_reverse > reverse adaptor (#{options[:len_reverse]} > #{options[:reverse].length})" end end tmpdir = Biopieces.mktmpdir file_records = File.join(tmpdir, "data.stream") count_file = 0 count_seq = 0 bases = 0 Biopieces.open(options[:stream_in], file_records) do |input, output| file_fasta = File.join(tmpdir, "#{count_file}.fna") out_fa = Fasta.open(file_fasta, 'w') input.each do |record| output.puts record if record[:SEQ] and record[:SEQ].length > 0 record[:SEQ_NAME] = count_seq seq = Seq.new_bp(record) out_fa.puts seq.to_fasta count_seq += 1; bases += record[:SEQ].length if bases > options[:bases_per_file] out_fa.close bases = 0 count_file += 1 file_fasta = File.join(tmpdir, "#{count_file}.fna") out_fa = Fasta.open(file_fasta, 'w') end end end out_fa.close if out_fa.respond_to? :close end patscan = PatScan.new(tmpdir, options[:cpus]) matches_left = {} matches_right = {} if options[:forward] pat_left = Pattern.create_left( options[:forward], options[:mismatches], options[:insertions], options[:deletions], options[:len_forward]) patscan.run(pat_left) matches_left = patscan.parse_results end if options[:reverse] patscan.clean pat_right = Pattern.create_right( options[:reverse], options[:mismatches], options[:insertions], options[:deletions], options[:len_reverse]) patscan.run(pat_right) matches_right = patscan.parse_results end count_seq = 0 Biopieces.open(file_records, options[:stream_out]) do |input, output| input.each_record do |record| if record[:SEQ] if matches_left[count_seq] record[:ADAPTOR_POS_LEFT] = matches_left[count_seq].first record[:ADAPTOR_LEN_LEFT] = matches_left[count_seq].last record[:ADAPTOR_PAT_LEFT] = record[:SEQ][record[:ADAPTOR_POS_LEFT] ... record[:ADAPTOR_POS_LEFT] + record[:ADAPTOR_LEN_LEFT]] end if matches_right[count_seq] record[:ADAPTOR_POS_RIGHT] = matches_right[count_seq].first record[:ADAPTOR_LEN_RIGHT] = matches_right[count_seq].last record[:ADAPTOR_PAT_RIGHT] = record[:SEQ][record[:ADAPTOR_POS_RIGHT] ... record[:ADAPTOR_POS_RIGHT] + record[:ADAPTOR_LEN_RIGHT]] end count_seq += 1; end output.puts record end end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< __END__