require 'biopieces'
require 'pp'
+MID_LEN = 10
+
mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG
ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG
TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA
end
casts = []
+casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
bp = Biopieces.new
options = bp.parse(ARGV, casts)
+pos = options[:pos]
+
bp.each_record do |record|
if record.has_key? :SEQ
- tag = record[:SEQ][4 ... 14].upcase
+ tag = record[:SEQ][pos ... pos + MID_LEN].upcase
if mid_hash.has_key? tag
count_hash[tag] += 1
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-# Remove Illumina adaptors or parts thereof from sequences in the stream.
+# Remove adaptors or parts thereof from sequences in the stream.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
require 'biopieces'
require 'seq'
+require 'profile'
casts = []
-casts << {:long=>'min', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'right_adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'left_adaptor', :short=>'l', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'right_hamming_dist', :short=>'R', :type=>'uint', :mandatory=>false, :default=>25, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'left_hamming_dist', :short=>'L', :type=>'uint', :mandatory=>false, :default=>25, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
bp = Biopieces.new
if record.has_key? :SEQ
entry = Seq.new(record[:SEQ_NAME], record[:SEQ], record[:TYPE], record[:SCORES])
- if options[:right_adaptor]
- pos_right = entry.adaptor_locate_right(options[:right_adaptor], options[:right_hamming_dist])
- entry.subseq!(0, pos_right) if pos_right >= 0 and entry.length - pos_right >= options[:min]
- record[:CLIP_ADAPTOR_RIGHT] = pos_right
- else
- record[:CLIP_ADAPTOR_RIGHT] = -1
+ if match = entry.adaptor_find(options[:adaptor], options[:mismatches], options[:insertions], options[:deletions])
+ record[:POS] = match.pos
end
-
- if options[:left_adaptor]
- pos_left = entry.adaptor_locate_left(options[:left_adaptor], options[:left_hamming_dist])
- entry.subseq!(pos_left + 1) if pos_left >= options[:min]
- record[:CLIP_ADAPTOR_LEFT] = pos_left
- else
- record[:CLIP_ADAPTOR_LEFT] = -1
- end
-
- record[:SEQ] = entry.seq
- record[:SEQ_LEN] = entry.len
- record[:SCORES] = entry.qual
end
bp.puts record
# located a Match object will be returned. If all paths are exhausted and
# no match is located the position is incremented. If no match is located
# whatsoever, then nil is returned.
- # TODO: converging paths should be skipped for speed-up.
def match(pattern, pos = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0)
@pattern = pattern
@max_mismatches = max_mismatches
new_paths = []
paths.each do |path|
- next if path.exhausted?(@seq, @pattern)
- return path.to_match if match_found?(path)
+ new_paths << path
- if path.match?(@seq, @pattern)
- new_paths << path.match
- elsif path.mismatches < max_mismatches
- new_paths << path.mismatch
+ (path.insertions ... @max_insertions).each do |i|
+ new_path = path.dup
+ new_path.insertions += i + 1
+ new_path.pattern_index += i + 1
+ new_paths << new_path
end
- new_paths << path.insertion if path.insertions < max_insertions
- new_paths << path.deletion if path.deletions < max_deletions
+ (path.deletions ... @max_deletions).each do |i|
+ new_path = path.dup
+ new_path.length += i + 1
+ new_path.deletions += i + 1
+ new_path.seq_index += i + 1
+ new_paths << new_path
+ end
end
- paths = new_paths
+ paths = []
+
+ new_paths.each do |p|
+ next if p.exhausted?(@seq, @pattern)
+ next if p.mismatches > @max_mismatches
+ return p.to_match if match_found?(p)
+ p.extend(@seq, @pattern)
+ paths << p
+ end
end
pos += 1
Match.new(@pos, @matches, @mismatches, @insertions, @deletions, @length)
end
- # Method that returns a new Match object for a matching path
- def match
- path_match = self.dup
- path_match.length += 1
- path_match.matches += 1
- path_match.seq_index += 1
- path_match.pattern_index += 1
- path_match
- end
-
- # Method that returns a new Match object for a matching path
- def mismatch
- path_mismatch = self.dup
- path_mismatch.length += 1
- path_mismatch.mismatches += 1
- path_mismatch.seq_index += 1
- path_mismatch.pattern_index += 1
- path_mismatch
- end
-
- # Method that returns a new Match object for a insertion path
- def insertion
- path_insertion = self.dup
- path_insertion.insertions += 1
- path_insertion.pattern_index += 1
- path_insertion
- end
+ # Method that extends a path with either a match or a mismatch.
+ def extend(seq, pattern)
+ if self.match?(seq, pattern)
+ self.matches += 1
+ else
+ self.mismatches += 1
+ end
- # Method that returns a new Match object for a deletion path
- def deletion
- path_deletion = self.dup
- path_deletion.length += 1
- path_deletion.deletions += 1
- path_deletion.seq_index += 1
- path_deletion
+ self.length += 1
+ self.seq_index += 1
+ self.pattern_index += 1
end
end