#!/usr/bin/env ruby # Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # http://www.gnu.org/copyleft/gpl.html # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # This program is part of the Biopieces framework (www.biopieces.org). # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Remove adaptors or parts thereof from sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'pp' require 'maasha/biopieces' require 'maasha/fasta' # Error class for PatScan errors. class PatScanError < StandardError; end; class PatScan def initialize(options, file_fasta, file_pattern, file_patscan) @options = options @file_fasta = file_fasta @file_pattern = file_pattern @file_patscan = file_patscan pat = Pattern.new(@options) pat.write(@file_pattern) end def run commands = [] commands << "nice -n 19" commands << "scan_for_matches" commands << @file_pattern commands << "< #{@file_fasta}" commands << "> #{@file_patscan}" command = commands.join(" ") system(command) raise PatScanError, "Command failed: #{command}" unless $?.success? end def parse_results matches = {} Fasta.open(@file_patscan, mode='r') do |ios| ios.each_entry do |entry| if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ name = $1.to_i start = $2.to_i - 1 stop = $3.to_i - 1 matches[name] = [start, stop - start + 1] unless matches.has_key? name else raise "Failed to parse sequence name: #{entry.seq_name}" end end end matches end end # Error class for Pattern errors. class PatternError < StandardError; end; class Pattern def initialize(options) @options = options @patterns = [] @patterns << pattern_internal @patterns += patterns_end if @options[:partial] end def to_i new_patterns = [] while @patterns.size > 1 new_patterns = @patterns[0 ... -2] new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )" @patterns = new_patterns end @patterns.first end def write(file) File.open(file, mode='w') do |ios| ios.puts self.to_i end end private def pattern_internal pattern = @options[:adaptor] mis = mis_count(pattern) ins = ins_count(pattern) del = del_count(pattern) "#{pattern}[#{mis},#{ins},#{del}]" end def patterns_end patterns = [] adaptor = @options[:adaptor] raise PatternError, "len > adaptor length: #{@options[:len]} > #{adaptor.length - 1}" if @options[:len] > adaptor.length - 1 (adaptor.length - 1).downto(@options[:len]) do |i| pattern = adaptor[0 ... i] mis = mis_count(pattern) ins = ins_count(pattern) del = del_count(pattern) patterns << "#{pattern}[#{mis},#{ins},#{del}] $" end patterns end def mis_count(pattern) (pattern.length * @options[:mismatches] * 0.01).round end def ins_count(pattern) (pattern.length * @options[:insertions] * 0.01).round end def del_count(pattern) (pattern.length * @options[:deletions] * 0.01).round end end casts = [] casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'partial', :short=>'p', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'len', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'} casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) tmpdir = Biopieces.mktmpdir file_fasta = File.join(tmpdir, "data.fna") file_records = File.join(tmpdir, "data.stream") file_pattern = File.join(tmpdir, "pattern.txt") file_patscan = File.join(tmpdir, "patscan.fna") count = 0 Biopieces.open(options[:stream_in], file_records) do |input, output| Fasta.open(file_fasta, mode='w') do |out_fa| input.each do |record| output.puts record if record.has_key? :SEQ record[:SEQ_NAME] = count out_fa.puts record count += 1; end end end end patscan = PatScan.new(options, file_fasta, file_pattern, file_patscan) patscan.run matches = patscan.parse_results count = 0 Biopieces.open(file_records, options[:stream_out]) do |input, output| input.each_record do |record| if record.has_key? :SEQ if matches.has_key? count record[:ADAPTOR_POS] = matches[count].first record[:ADAPTOR_LEN] = matches[count].last end count += 1; end output.puts record end end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< __END__