From e90da2f93e0f1f36f8397cbc456000ba1f3061ff Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 30 Mar 2011 09:35:54 +0000 Subject: [PATCH] updated find_mids git-svn-id: http://biopieces.googlecode.com/svn/trunk@1314 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/find_mids | 7 ++- bp_bin/remove_illumina_adaptor | 32 +++-------- code_ruby/Maasha/lib/patternmatcher.rb | 76 +++++++++++--------------- 3 files changed, 46 insertions(+), 69 deletions(-) diff --git a/bp_bin/find_mids b/bp_bin/find_mids index 4e61049..73bbfb6 100755 --- a/bp_bin/find_mids +++ b/bp_bin/find_mids @@ -32,6 +32,8 @@ require 'biopieces' require 'pp' +MID_LEN = 10 + mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA @@ -76,14 +78,17 @@ mids.each_with_index do |mid, i| end casts = [] +casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} bp = Biopieces.new options = bp.parse(ARGV, casts) +pos = options[:pos] + bp.each_record do |record| if record.has_key? :SEQ - tag = record[:SEQ][4 ... 14].upcase + tag = record[:SEQ][pos ... pos + MID_LEN].upcase if mid_hash.has_key? tag count_hash[tag] += 1 diff --git a/bp_bin/remove_illumina_adaptor b/bp_bin/remove_illumina_adaptor index 7f74c8a..0b0b1b9 100755 --- a/bp_bin/remove_illumina_adaptor +++ b/bp_bin/remove_illumina_adaptor @@ -24,20 +24,20 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Remove Illumina adaptors or parts thereof from sequences in the stream. +# Remove adaptors or parts thereof from sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'biopieces' require 'seq' +require 'profile' casts = [] -casts << {:long=>'min', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'right_adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'left_adaptor', :short=>'l', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'right_hamming_dist', :short=>'R', :type=>'uint', :mandatory=>false, :default=>25, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'left_hamming_dist', :short=>'L', :type=>'uint', :mandatory=>false, :default=>25, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'adaptor', :short=>'r', :type=>'string', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>20, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'insertions', :short=>'i', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'deletions', :short=>'d', :type=>'uint', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} bp = Biopieces.new @@ -47,25 +47,9 @@ bp.each_record do |record| if record.has_key? :SEQ entry = Seq.new(record[:SEQ_NAME], record[:SEQ], record[:TYPE], record[:SCORES]) - if options[:right_adaptor] - pos_right = entry.adaptor_locate_right(options[:right_adaptor], options[:right_hamming_dist]) - entry.subseq!(0, pos_right) if pos_right >= 0 and entry.length - pos_right >= options[:min] - record[:CLIP_ADAPTOR_RIGHT] = pos_right - else - record[:CLIP_ADAPTOR_RIGHT] = -1 + if match = entry.adaptor_find(options[:adaptor], options[:mismatches], options[:insertions], options[:deletions]) + record[:POS] = match.pos end - - if options[:left_adaptor] - pos_left = entry.adaptor_locate_left(options[:left_adaptor], options[:left_hamming_dist]) - entry.subseq!(pos_left + 1) if pos_left >= options[:min] - record[:CLIP_ADAPTOR_LEFT] = pos_left - else - record[:CLIP_ADAPTOR_LEFT] = -1 - end - - record[:SEQ] = entry.seq - record[:SEQ_LEN] = entry.len - record[:SCORES] = entry.qual end bp.puts record diff --git a/code_ruby/Maasha/lib/patternmatcher.rb b/code_ruby/Maasha/lib/patternmatcher.rb index b4d92b8..17c9f4d 100644 --- a/code_ruby/Maasha/lib/patternmatcher.rb +++ b/code_ruby/Maasha/lib/patternmatcher.rb @@ -61,7 +61,6 @@ module PatternMatcher # located a Match object will be returned. If all paths are exhausted and # no match is located the position is incremented. If no match is located # whatsoever, then nil is returned. - # TODO: converging paths should be skipped for speed-up. def match(pattern, pos = 0, max_mismatches = 0, max_insertions = 0, max_deletions = 0) @pattern = pattern @max_mismatches = max_mismatches @@ -76,20 +75,33 @@ module PatternMatcher new_paths = [] paths.each do |path| - next if path.exhausted?(@seq, @pattern) - return path.to_match if match_found?(path) + new_paths << path - if path.match?(@seq, @pattern) - new_paths << path.match - elsif path.mismatches < max_mismatches - new_paths << path.mismatch + (path.insertions ... @max_insertions).each do |i| + new_path = path.dup + new_path.insertions += i + 1 + new_path.pattern_index += i + 1 + new_paths << new_path end - new_paths << path.insertion if path.insertions < max_insertions - new_paths << path.deletion if path.deletions < max_deletions + (path.deletions ... @max_deletions).each do |i| + new_path = path.dup + new_path.length += i + 1 + new_path.deletions += i + 1 + new_path.seq_index += i + 1 + new_paths << new_path + end end - paths = new_paths + paths = [] + + new_paths.each do |p| + next if p.exhausted?(@seq, @pattern) + next if p.mismatches > @max_mismatches + return p.to_match if match_found?(p) + p.extend(@seq, @pattern) + paths << p + end end pos += 1 @@ -172,41 +184,17 @@ module PatternMatcher Match.new(@pos, @matches, @mismatches, @insertions, @deletions, @length) end - # Method that returns a new Match object for a matching path - def match - path_match = self.dup - path_match.length += 1 - path_match.matches += 1 - path_match.seq_index += 1 - path_match.pattern_index += 1 - path_match - end - - # Method that returns a new Match object for a matching path - def mismatch - path_mismatch = self.dup - path_mismatch.length += 1 - path_mismatch.mismatches += 1 - path_mismatch.seq_index += 1 - path_mismatch.pattern_index += 1 - path_mismatch - end - - # Method that returns a new Match object for a insertion path - def insertion - path_insertion = self.dup - path_insertion.insertions += 1 - path_insertion.pattern_index += 1 - path_insertion - end + # Method that extends a path with either a match or a mismatch. + def extend(seq, pattern) + if self.match?(seq, pattern) + self.matches += 1 + else + self.mismatches += 1 + end - # Method that returns a new Match object for a deletion path - def deletion - path_deletion = self.dup - path_deletion.length += 1 - path_deletion.deletions += 1 - path_deletion.seq_index += 1 - path_deletion + self.length += 1 + self.seq_index += 1 + self.pattern_index += 1 end end -- 2.39.5