#!/usr/bin/env ruby # Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # http://www.gnu.org/copyleft/gpl.html # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # This program is part of the Biopieces framework (www.biopieces.org). # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Remove adaptors or parts thereof from sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'pp' require 'maasha/biopieces' require 'maasha/fasta' # Error class for PatScan errors. class PatScanError < StandardError; end; class PatScan def initialize(options, tmpdir, file_pattern, cpus) @options = options @tmpdir = tmpdir @file_pattern = file_pattern @cpus = cpus @files_fasta = Dir.glob(File.join(@tmpdir, "*.fna")) pat = Pattern.new(@options) pat.write(@file_pattern) end def run child_count = 0 ch_mutex = Mutex.new threads = [] @files_fasta.each do |file| Thread.pass while child_count >= @cpus ch_mutex.synchronize { child_count += 1 } threads << Thread.new do command = command_compile(file) system(command) raise PatScanError, "Command failed: #{command}" unless $?.success? ch_mutex.synchronize { child_count -= 1 } end end threads.each { |t| t.join } end def command_compile(file) commands = [] commands << "nice -n 19" commands << "scan_for_matches" commands << @file_pattern commands << "< #{file}" commands << "> #{file}.out" command = commands.join(" ") end def parse_results files_result = Dir.glob(File.join(@tmpdir, "*.out")) matches = {} files_result.each do |file| Fasta.open(file, 'r') do |ios| ios.each do |entry| if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ name = $1.to_i start = $2.to_i - 1 stop = $3.to_i - 1 matches[name] = [start, stop - start + 1] unless matches.has_key? name else raise "Failed to parse sequence name: #{entry.seq_name}" end end end end matches end end # Error class for Pattern errors. class PatternError < StandardError; end; class Pattern def initialize(options) @options = options @patterns = [] @patterns << pattern_internal @patterns += patterns_end if @options[:partial] end def to_i new_patterns = [] while @patterns.size > 1 new_patterns = @patterns[0 ... -2] new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )" @patterns = new_patterns end @patterns.first end def write(file) File.open(file, 'w') do |ios| ios.puts self.to_i end end private def pattern_internal pattern = @options[:adaptor] mis = mis_count(pattern) ins = ins_count(pattern) del = del_count(pattern) "#{pattern}[#{mis},#{ins},#{del}]" end def patterns_end patterns = [] adaptor = @options[:adaptor] raise PatternError, "len > adaptor length: #{@options[:len]} > #{adaptor.length - 1}" if @options[:len] > adaptor.length - 1 (adaptor.length - 1).downto(@options[:len]) do |i| pattern = adaptor[0 ... i] mis = mis_count(pattern) ins = ins_count(pattern) del = del_count(pattern) patterns << "#{pattern}[#{mis},#{ins},#{del}] $" end patterns end def mis_count(pattern) (pattern.length * @options[:mismatches] * 0.01).round end def ins_count(pattern) (pattern.length * @options[:insertions] * 0.01).round end def del_count(pattern) (pattern.length * @options[:deletions] * 0.01).round end end casts = [] casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'partial', :short=>'p', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'len', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'} casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} casts << {:long=>'cpus', :short=>'c', :type=>'uint', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>'0'} BASE_PER_FILE = 10_000_000 options = Biopieces.options_parse(ARGV, casts) tmpdir = Biopieces.mktmpdir file_records = File.join(tmpdir, "data.stream") file_pattern = File.join(tmpdir, "pattern.txt") number_file = 0 number_seq = 0 bases = 0 Biopieces.open(options[:stream_in], file_records) do |input, output| file_fasta = File.join(tmpdir, "#{number_file}.fna") out_fa = Fasta.open(file_fasta, 'w') input.each do |record| output.puts record if record[:SEQ] and record[:SEQ].length > 0 record[:SEQ_NAME] = number_seq seq = Seq.new_bp(record) out_fa.puts seq.to_fasta number_seq += 1; bases += record[:SEQ].length if bases > BASE_PER_FILE out_fa.close bases = 0 number_file += 1 file_fasta = File.join(tmpdir, "#{number_file}.fna") out_fa = Fasta.open(file_fasta, 'w') end end end out_fa.close if out_fa.respond_to? :close end patscan = PatScan.new(options, tmpdir, file_pattern, options[:cpus]) patscan.run matches = patscan.parse_results number_seq = 0 Biopieces.open(file_records, options[:stream_out]) do |input, output| input.each_record do |record| if record.has_key? :SEQ if matches.has_key? number_seq record[:ADAPTOR_POS] = matches[number_seq].first record[:ADAPTOR_LEN] = matches[number_seq].last end number_seq += 1; end output.puts record end end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< __END__