From 4b299e55753c044bb6f80094a1748bf94416d749 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Mon, 23 May 2011 15:07:31 +0000 Subject: [PATCH] fixed critical bug in biopieces.rb and reworked find_adaptor git-svn-id: http://biopieces.googlecode.com/svn/trunk@1415 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/find_adaptor | 227 +++++++++++++++++------------- code_ruby/lib/maasha/biopieces.rb | 24 ++-- 2 files changed, 144 insertions(+), 107 deletions(-) diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor index 6319874..53e453a 100755 --- a/bp_bin/find_adaptor +++ b/bp_bin/find_adaptor @@ -29,138 +29,175 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +require 'pp' require 'maasha/biopieces' -require 'maasha/seq' - -def disambiguate(adaptor) - adaptor_disamb = adaptor.dup - adaptor_disamb.gsub!('U', 'T') - adaptor_disamb.gsub!('R', '[AG]') - adaptor_disamb.gsub!('Y', '[CT]') - adaptor_disamb.gsub!('S', '[GC]') - adaptor_disamb.gsub!('W', '[AT]') - adaptor_disamb.gsub!('M', '[AC]') - adaptor_disamb.gsub!('K', '[GT]') - adaptor_disamb.gsub!('V', '[ACG]') - adaptor_disamb.gsub!('H', '[ACT]') - adaptor_disamb.gsub!('D', '[AGT]') - adaptor_disamb.gsub!('B', '[CGT]') - adaptor_disamb.gsub!('N', '.') - adaptor_disamb -end +require 'maasha/fasta' -class Seq - # Method that finds an adaptor or part thereof in the sequence of a Seq object. - # Returns a Match object if the adaptor was found otherwise nil. The ed_percent - # indicates the maximum edit distance allowed in all possible overlaps. - def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0, dist = 0) - raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent +# Error class for PatScan errors. +class PatScanError < StandardError; end; - if pos < 0 - pos = self.length + pos # pos offset from the right end - end +class PatScan + def initialize(options, file_fasta, file_pattern, file_patscan) + @options = options + @file_fasta = file_fasta + @file_pattern = file_pattern + @file_patscan = file_patscan + + pat = Pattern.new(@options) + pat.write(@file_pattern) + end + + def run + commands = [] + commands << "nice -n 19" + commands << "scan_for_matches" + commands << @file_pattern + commands << "< #{@file_fasta}" + commands << "> #{@file_patscan}" + command = commands.join(" ") + system(command) + raise PatScanError, "Command failed: #{command}" unless $?.success? + end - if pos < self.length - dist + 1 - if match = adaptor_find_simple(adaptor_disamb, pos) - return match - elsif match = adaptor_find_complex(adaptor, pos, ed_percent) - return match - elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent, dist) - return match + def parse_results + matches = {} + + Fasta.open(@file_patscan, mode='r') do |ios| + ios.each_entry do |entry| + if entry.seq_name =~ /^(\d+):\[(\d+),(\d+)\]$/ + name = $1.to_i + start = $2.to_i - 1 + stop = $3.to_i - 1 + matches[name] = [start, stop - start + 1] + else + raise "Failed to parse sequence name: #{entry.seq_name}" + end end end + + matches end +end - private +# Error class for Pattern errors. +class PatternError < StandardError; end; - # Method to find an adaptor in a sequence taking into account ambiguity - # codes, but not considering mismatches, insertions, and deletions. - def adaptor_find_simple(adaptor, pos) - self.seq.upcase.match(adaptor, pos) do |m| - return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length) - end +class Pattern + def initialize(options) + @options = options + @patterns = [] + @patterns << pattern_internal + @patterns += patterns_end if @options[:trim_end] end - # Method to find an adaptor in a sequence taking into account ambiguity - # codes, mismatches, insertions, and deletions. - def adaptor_find_complex(adaptor, pos, ed_percent) - ed_max = (adaptor.length * ed_percent * 0.01).round + def to_i + new_patterns = [] - match = self.match(adaptor, pos, ed_max) + while @patterns.size > 1 + new_patterns = @patterns[0 ... -2] + new_patterns << "( #{@patterns[-2 .. -1].join(' | ')} )" - match - end + @patterns = new_patterns + end - # Method to find part of an adaptor at the right end of a sequence taking - # into account ambiguity codes, mismatches, insertions, and deletions. - def adaptor_partial_find_complex(adaptor, pos, ed_percent, dist) - if pos > self.length - adaptor.length - adaptor = adaptor[0 ... self.length - pos] - else - adaptor = adaptor[0 ... adaptor.length] + @patterns.first + end - pos = self.length - adaptor.length + def write(file) + File.open(file, mode='w') do |ios| + ios.puts self.to_i end + end - # puts self.seq + private - while adaptor.length > 0 and adaptor.length >= dist - # puts (" " * pos) + adaptor + def pattern_internal + pattern = @options[:adaptor] + mis = mis_count(pattern) + ins = ins_count(pattern) + del = del_count(pattern) - ed_max = (adaptor.length * ed_percent * 0.01).round + "#{pattern}[#{mis},#{ins},#{del}]" + end - if ed_max == 0 - self.seq.upcase.match(adaptor, pos) do |m| - return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length) - end - else - self.scan(adaptor, pos, ed_max).each do |match| - return match - end - end + def patterns_end + patterns = [] + adaptor = @options[:adaptor] - adaptor = adaptor[0 ... -1] + raise PatternError, "trim_end_min > adaptor length: #{@options[:trim_end_min]} > #{adaptor.length - 1}" if @options[:trim_end_min] > adaptor.length - 1 - pos += 1 + (adaptor.length - 1).downto(@options[:trim_end_min]) do |i| + pattern = adaptor[0 ... i] + mis = mis_count(pattern) + ins = ins_count(pattern) + del = del_count(pattern) + patterns << "#{pattern}[#{mis},#{ins},#{del}] $" end + + patterns + end + + def mis_count(pattern) + (pattern.length * @options[:mismatches] * 0.01).round + end + + def ins_count(pattern) + (pattern.length * @options[:insertions] * 0.01).round + end + + def del_count(pattern) + (pattern.length * @options[:deletions] * 0.01).round end end casts = [] -casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'edit_distance', :short=>'e', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'pos', :short=>'p', :type=>'int', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"} -casts << {:long=>'dist', :short=>'d', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'cache', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'adaptor', :short=>'a', :type=>'string', :mandatory=>true, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'trim_end', :short=>'t', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'trim_end_min', :short=>'l', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) -adaptor = options[:adaptor].to_s.upcase -adaptor_disamb = disambiguate(adaptor) +tmpdir = Biopieces.mktmpdir +file_fasta = File.join(tmpdir, "data.fna") +file_records = File.join(tmpdir, "data.stream") +file_pattern = File.join(tmpdir, "pattern.txt") +file_patscan = File.join(tmpdir, "patscan.fna") -pos = options[:pos] -pos -= 1 if pos > 0 # pos was 1-based +count = 0 -cache = {} +Biopieces.open(options[:stream_in], file_records) do |input, output| + Fasta.open(file_fasta, mode='w') do |out_fa| + input.each do |record| + output.puts record -Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - input.each_record do |record| - if record.has_key? :SEQ - entry = Seq.new(record[:SEQ_NAME], record[:SEQ], "dna", record[:SCORES]) + if record.has_key? :SEQ + record[:SEQ_NAME] = count + out_fa.puts record - if cache[entry.seq.upcase.to_sym] and options[:cache] - match = cache[entry.seq.upcase] - else - match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance], options[:dist]) - - cache[entry.seq.upcase.to_sym] = match if match and options[:cache] + count += 1; end + end + end +end + +patscan = PatScan.new(options, file_fasta, file_pattern, file_patscan) +patscan.run +matches = patscan.parse_results - if match - record[:ADAPTOR_POS] = match.pos - record[:ADAPTOR_LEN] = match.length - record[:ADAPTOR_MATCH] = match.match +count = 0 + +Biopieces.open(file_records, options[:stream_out]) do |input, output| + input.each_record do |record| + if record.has_key? :SEQ + if matches.has_key? count + record[:ADAPTOR_POS] = matches[count].first + record[:ADAPTOR_LEN] = matches[count].last end + + count += 1; end output.puts record diff --git a/code_ruby/lib/maasha/biopieces.rb b/code_ruby/lib/maasha/biopieces.rb index c1dc49b..5fd6edf 100644 --- a/code_ruby/lib/maasha/biopieces.rb +++ b/code_ruby/lib/maasha/biopieces.rb @@ -149,22 +149,22 @@ class Biopieces # Class method for opening data stream for reading Biopiece records. # Records are read from STDIN (default) or file (possibly gzipped). def self.open_input(input) - if STDIN.tty? - if input.nil? + if input.nil? + if STDIN.tty? input = self.new(StringIO.new) else - ios = File.open(input, mode='r') - - begin - ios = Zlib::GzipReader.new(ios) - rescue - ios.rewind - end + input = self.new(STDIN, stdio = true) + end + elsif File.exists? input + ios = File.open(input, mode='r') - input = self.new(ios) + begin + ios = Zlib::GzipReader.new(ios) + rescue + ios.rewind end - else - input = self.new(STDIN, stdio = true) + + input = self.new(ios) end input -- 2.39.5