# Method that finds an adaptor or part thereof in the sequence of a Seq object.
# Returns a Match object if the adaptor was found otherwise nil. The ed_percent
# indicates the maximum edit distance allowed in all possible overlaps.
- def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0)
+ def adaptor_find(adaptor, adaptor_disamb, pos = 0, ed_percent = 0, dist = 0)
raise SeqError, "Edit distance percent out of range #{ed_percent}" unless (0 .. 100).include? ed_percent
if pos < 0
pos = self.length + pos # pos offset from the right end
end
- if pos < self.length
+ if pos < self.length - dist + 1
if match = adaptor_find_simple(adaptor_disamb, pos)
return match
elsif match = adaptor_find_complex(adaptor, pos, ed_percent)
return match
- elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent)
+ elsif match = adaptor_partial_find_complex(adaptor, pos, ed_percent, dist)
return match
end
end
# Method to find part of an adaptor at the right end of a sequence taking
# into account ambiguity codes, mismatches, insertions, and deletions.
- def adaptor_partial_find_complex(adaptor, pos, ed_percent)
+ def adaptor_partial_find_complex(adaptor, pos, ed_percent, dist)
if pos > self.length - adaptor.length
adaptor = adaptor[0 ... self.length - pos]
else
- adaptor = adaptor[0 ... adaptor.length - 1]
+ adaptor = adaptor[0 ... adaptor.length]
pos = self.length - adaptor.length
end
- #puts self.seq
+ # puts self.seq
- while adaptor.length > 0
- #puts (" " * pos) + adaptor
+ while adaptor.length > 0 and adaptor.length >= dist
+ # puts (" " * pos) + adaptor
ed_max = (adaptor.length * ed_percent * 0.01).round
casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
casts << {:long=>'edit_distance', :short=>'e', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil}
casts << {:long=>'pos', :short=>'p', :type=>'int', :mandatory=>false, :default=>1, :allowed=>nil, :disallowed=>"0"}
+casts << {:long=>'dist', :short=>'d', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
casts << {:long=>'cache', :short=>'c', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
bp = Biopieces.new
if cache[entry.seq.upcase.to_sym] and options[:cache]
match = cache[entry.seq.upcase]
else
- match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance])
+ match = entry.adaptor_find(adaptor, adaptor_disamb, pos, options[:edit_distance], options[:dist])
cache[entry.seq.upcase.to_sym] = match if match and options[:cache]
end
--- /dev/null
+SEQ_NAME: test_full_length_adaptor
+SEQ: TCGTATGCCGTCTTCTGCTTG
+SEQ_LEN: 21
+ADAPTOR_POS: 0
+ADAPTOR_LEN: 21
+ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG
+---
+SEQ_NAME: test_begin_match
+SEQ: TCGTATGCCGTCTTCTGCTTGactacgt
+SEQ_LEN: 28
+ADAPTOR_POS: 0
+ADAPTOR_LEN: 21
+ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG
+---
+SEQ_NAME: test_middle_match
+SEQ: actgactgaTCGTATGCCGTCTTCTGCTTGactacgt
+SEQ_LEN: 37
+ADAPTOR_POS: 9
+ADAPTOR_LEN: 21
+ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG
+---
+SEQ_NAME: test_end_match
+SEQ: gactgaTCGTATGCCGTCTTCTGCTTG
+SEQ_LEN: 27
+ADAPTOR_POS: 6
+ADAPTOR_LEN: 21
+ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG
+---
+SEQ_NAME: test_pos_begin
+SEQ: gactgaTCGTATGCCGTCTTCTGCTTGgactgaTCGTATGCCGTCTTCTGCTTGacgta
+SEQ_LEN: 59
+ADAPTOR_POS: 6
+ADAPTOR_LEN: 21
+ADAPTOR_MATCH: TCGTATGCCGTCTTCTGCTTG
+---
+SEQ_NAME: test_edit_dist_5
+SEQ: actgactgaTCGGATGCGGTCTCATGTTGactacgt
+SEQ_LEN: 36
+---
+SEQ_NAME: test_end_trim
+SEQ: gtgacactatcgatacgatcgacactgaTCGTA
+SEQ_LEN: 33
+ADAPTOR_POS: 28
+ADAPTOR_LEN: 5
+ADAPTOR_MATCH: TCGTA
+---
+SEQ_NAME: test_end_trim_edit_dist_1
+SEQ: gatgatcgtagcgatcgatcgacgctgaTCGTG
+SEQ_LEN: 33
+---