From: martinahansen Date: Fri, 13 May 2011 10:11:55 +0000 (+0000) Subject: add dist option to find_adaptor X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=34ab63ac19cd0277c6b70697b25539fc145dcc22;p=biopieces.git add dist option to find_adaptor git-svn-id: http://biopieces.googlecode.com/svn/trunk@1403 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/find_adaptor b/bp_bin/find_adaptor index cfa050f..e1ac8f3 100755 --- a/bp_bin/find_adaptor +++ b/bp_bin/find_adaptor @@ -53,19 +53,19 @@ class Seq # Method that finds an adaptor or part thereof in the sequence of a Seq object. # Returns a Match object if the adaptor was found otherwise nil. The ed_percent # indicates the maximum edit distance allowed in all possible overlaps. - def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0) + def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0, dist = 0) raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent if pos < 0 pos = self.length + pos # pos offset from the right end end - if pos < self.length + if pos < self.length - dist + 1 if match = adaptor_find_simple(adaptor_disamb, pos) return match elsif match = adaptor_find_complex(adaptor, pos, ed_percent) return match - elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent) + elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent, dist) return match end end @@ -93,19 +93,19 @@ class Seq # Method to find part of an adaptor at the right end of a sequence taking # into account ambiguity codes, mismatches, insertions, and deletions. - def adaptor_partial_find_complex(adaptor, pos, ed_percent) + def adaptor_partial_find_complex(adaptor, pos, ed_percent, dist) if pos > self.length - adaptor.length adaptor = adaptor[0 ... self.length - pos] else - adaptor = adaptor[0 ... adaptor.length - 1] + adaptor = adaptor[0 ... adaptor.length] pos = self.length - adaptor.length end - #puts self.seq + # puts self.seq - while adaptor.length > 0 - #puts (" " * pos) + adaptor + while adaptor.length > 0 and adaptor.length >= dist + # puts (" " * pos) + adaptor ed_max = (adaptor.length * ed_percent * 0.01).round @@ -130,6 +130,7 @@ casts = [] casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'edit_distance', :short=>'e', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil} casts << {:long=>'pos', :short=>'p', :type=>'int', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'dist', :short=>'d', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} casts << {:long=>'cache', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} bp = Biopieces.new @@ -151,7 +152,7 @@ bp.each_record do |record| if cache[entry.seq.upcase.to_sym] and options[:cache] match = cache[entry.seq.upcase] else - match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance]) + match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance], options[:dist]) cache[entry.seq.upcase.to_sym] = match if match and options[:cache] end diff --git a/bp_test/out/find_adaptor.out.6 b/bp_test/out/find_adaptor.out.6 new file mode 100644 index 0000000..45197a2 --- /dev/null +++ b/bp_test/out/find_adaptor.out.6 @@ -0,0 +1,50 @@ +SEQ_NAME: test_full_length_adaptor +SEQ: TCGTATGCCGTCTTCTGCTTG +SEQ_LEN: 21 +ADAPTOR_POS: 0 +ADAPTOR_LEN: 21 +ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG +--- +SEQ_NAME: test_begin_match +SEQ: TCGTATGCCGTCTTCTGCTTGactacgt +SEQ_LEN: 28 +ADAPTOR_POS: 0 +ADAPTOR_LEN: 21 +ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG +--- +SEQ_NAME: test_middle_match +SEQ: actgactgaTCGTATGCCGTCTTCTGCTTGactacgt +SEQ_LEN: 37 +ADAPTOR_POS: 9 +ADAPTOR_LEN: 21 +ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG +--- +SEQ_NAME: test_end_match +SEQ: gactgaTCGTATGCCGTCTTCTGCTTG +SEQ_LEN: 27 +ADAPTOR_POS: 6 +ADAPTOR_LEN: 21 +ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG +--- +SEQ_NAME: test_pos_begin +SEQ: gactgaTCGTATGCCGTCTTCTGCTTGgactgaTCGTATGCCGTCTTCTGCTTGacgta +SEQ_LEN: 59 +ADAPTOR_POS: 6 +ADAPTOR_LEN: 21 +ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG +--- +SEQ_NAME: test_edit_dist_5 +SEQ: actgactgaTCGGATGCGGTCTCATGTTGactacgt +SEQ_LEN: 36 +--- +SEQ_NAME: test_end_trim +SEQ: gtgacactatcgatacgatcgacactgaTCGTA +SEQ_LEN: 33 +ADAPTOR_POS: 28 +ADAPTOR_LEN: 5 +ADAPTOR_MATCH: TCGTA +--- +SEQ_NAME: test_end_trim_edit_dist_1 +SEQ: gatgatcgtagcgatcgatcgacgctgaTCGTG +SEQ_LEN: 33 +--- diff --git a/bp_test/test/test_find_adaptor b/bp_test/test/test_find_adaptor index 8308f65..a769cfd 100755 --- a/bp_test/test/test_find_adaptor +++ b/bp_test/test/test_find_adaptor @@ -21,3 +21,7 @@ clean run "$bp -a TCGTATGCCGTCTTCTGCTTG -e 30 -I $in -O $tmp" assert_no_diff $tmp $out.5 clean + +run "$bp -a TCGTATGCCGTCTTCTGCTTG -e 0 -d 5 -I $in -O $tmp" +assert_no_diff $tmp $out.6 +clean