From: martinahansen Date: Mon, 18 Apr 2011 07:45:54 +0000 (+0000) Subject: added find_adaptor biopiece X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=e8a22d61f6c97f1831574bbde8f737978d9208d9;p=biopieces.git added find_adaptor biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@1330 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor new file mode 100755 index 0000000..f19648e --- /dev/null +++ b/bp_bin/find_adaptor @@ -0,0 +1,150 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Remove adaptors or parts thereof from sequences in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +require 'biopieces' +require 'seq' + +def disambiguate(adaptor) + adaptor_disamb = adaptor.dup + adaptor_disamb.gsub!('U', 'T') + adaptor_disamb.gsub!('R', '[AG]') + adaptor_disamb.gsub!('Y', '[CT]') + adaptor_disamb.gsub!('S', '[GC]') + adaptor_disamb.gsub!('W', '[AT]') + adaptor_disamb.gsub!('M', '[AC]') + adaptor_disamb.gsub!('K', '[GT]') + adaptor_disamb.gsub!('V', '[ACG]') + adaptor_disamb.gsub!('H', '[ACT]') + adaptor_disamb.gsub!('D', '[AGT]') + adaptor_disamb.gsub!('B', '[CGT]') + adaptor_disamb.gsub!('N', '.') + adaptor_disamb +end + +class Seq + # Method that finds an adaptor or part thereof in the sequence of a Seq object. + # Returns a Match object if the adaptor was found otherwise nil. The ed_percent + # indicates the maximum edit distance allowed in all possible overlaps. + def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0) + raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent + + if pos < 0 + pos = self.length - pos # pos offset from the right end + end + + match = adaptor_find_simple(adaptor_disamb, pos) or + adaptor_find_complex(adaptor, pos, ed_percent) or + adaptor_partial_find_complex(adaptor, ed_percent) + + match + end + + private + + # Method to find an adaptor in a sequence taking into account ambiguity + # codes, but not considering mismatches, insertions, and deletions. + def adaptor_find_simple(adaptor, pos) + self.seq.upcase.match(adaptor, pos) do |m| + return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length) + end + end + + # Method to find an adaptor in a sequence taking into account ambiguity + # codes, mismatches, insertions, and deletions. + def adaptor_find_complex(adaptor, pos, ed_percent) + ed_max = (adaptor.length * ed_percent * 0.01).round + + self.scan(adaptor, pos, ed_max).each do |match| + return match + end + end + + # Method to find part of an adaptor at the right end of a sequence taking + # into account ambiguity codes, mismatches, insertions, and deletions. + def adaptor_partial_find_complex(adaptor, ed_percent) + adaptor = adaptor[0 ... -1] + + pos = self.len - adaptor.length + + while adaptor.length > 0 + ed_max = (adaptor.length * ed_percent * 0.01).round + + if ed_max == 0 + self.seq.upcase.match(adaptor, pos) do |m| + return Match.new($`.length, m, m.to_s.length, 0, 0, 0, m.to_s.length) + end + else + self.scan(adaptor, pos, ed_max).each do |match| + return match + end + end + + adaptor = adaptor[0 ... -1] + + pos = self.len - adaptor.length + end + end +end + +casts = [] +casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'edit_distance', :short=>'e', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'pos', :short=>'p', :type=>'int', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"} + +bp = Biopieces.new + +options = bp.parse(ARGV, casts) + +adaptor = options[:adaptor].to_s.upcase +adaptor_disamb = disambiguate(adaptor) + +pos = options[:pos] +pos -= 1 if pos > 0 # pos was 1-based + +bp.each_record do |record| + if record.has_key? :SEQ + entry = Seq.new(record[:SEQ_NAME], record[:SEQ], "dna", record[:SCORES]) + + if match = entry.adaptor_find(adaptor, adaptor_disamb, options[:pos], options[:edit_distance]) + record[:ADAPTOR_POS] = match.pos + record[:ADAPTOR_LEN] = match.length + record[:ADAPTOR_MATCH] = match.match + end + end + + bp.puts record +end + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/bp_bin/remove_illumina_adaptor b/bp_bin/remove_illumina_adaptor deleted file mode 100755 index 0b0b1b9..0000000 --- a/bp_bin/remove_illumina_adaptor +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env ruby - -# Copyright (C) 2007-2011 Martin A. Hansen. - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -# http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# This program is part of the Biopieces framework (www.biopieces.org). - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Remove adaptors or parts thereof from sequences in the stream. - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -require 'biopieces' -require 'seq' -require 'profile' - -casts = [] -casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} - -bp = Biopieces.new - -options = bp.parse(ARGV, casts) - -bp.each_record do |record| - if record.has_key? :SEQ - entry = Seq.new(record[:SEQ_NAME], record[:SEQ], record[:TYPE], record[:SCORES]) - - if match = entry.adaptor_find(options[:adaptor], options[:mismatches], options[:insertions], options[:deletions]) - record[:POS] = match.pos - end - end - - bp.puts record -end - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -__END__ diff --git a/code_ruby/Maasha/lib/biopieces.rb b/code_ruby/Maasha/lib/biopieces.rb index 37edd66..e8486c3 100644 --- a/code_ruby/Maasha/lib/biopieces.rb +++ b/code_ruby/Maasha/lib/biopieces.rb @@ -289,7 +289,7 @@ class OptionHandler end when 'string' option.on("-#{cast[:short]}", "--#{cast[:long]} S", String) do |s| - @options[cast[:long]] = s.to_sym + @options[cast[:long]] = s.to_sym # TODO: this to_sym - is that needed? end when REGEX_LIST option.on( "-#{cast[:short]}", "--#{cast[:long]} A", Array) do |a|