From: martinahansen Date: Tue, 6 Sep 2011 09:53:26 +0000 (+0000) Subject: rewrote remove_mids again X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=3f7ec22c0986fb826629451a1c3cfdd3f2274817;p=biopieces.git rewrote remove_mids again git-svn-id: http://biopieces.googlecode.com/svn/trunk@1521 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/remove_mids b/bp_bin/remove_mids index 74846c1..c73d23c 100755 --- a/bp_bin/remove_mids +++ b/bp_bin/remove_mids @@ -32,7 +32,10 @@ require 'maasha/biopieces' require 'maasha/bits' require 'pp' -gsmid_hash = { +GS_SIZE = 10 +RL_SIZE = 11 + +GSMID_HASH = { ACGAGTGCGT: "MID1", TCTCTATGCG: "MID10", TAGACTGCAC: "MID100", @@ -188,7 +191,7 @@ gsmid_hash = { CTGTACATAC: "MID99", } -rlmid_hash = { +RLMID_HASH = { ACACGACGACT: "RL1", ACACGTAGTAT: "RL2", ACACTACTCGT: "RL3", @@ -203,69 +206,95 @@ rlmid_hash = { ACTCGCGTCGT: "RL12" } -casts = [] -casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil} +class MIDfinder + def initialize(mid_hash, pos, size, max_mismatches) + @mid_hash = mid_hash + @pos = pos + @size = size + @max_mismatches = max_mismatches + end -options = Biopieces.options_parse(ARGV, casts) + def find_mid(seq) + hamming_dist = 0 + tag = seq[@pos ... @pos + @size].upcase.to_sym -pos = options[:pos] -mismatches = options[:mismatches] + if @mid_hash.has_key? tag + return MID.new(tag, @mid_hash[tag], @pos, @size, hamming_dist) + elsif @max_mismatches > 0 + @mid_hash.each_pair do |mid, mid_name| + hamming_dist = tag.to_s.hamming_distance(mid.to_s) -Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - input.each_record do |record| - if record.has_key? :SEQ - found_tag = nil - hamming_dist = 0 + if hamming_dist <= @max_mismatches + return MID.new(mid, @mid_hash[mid], @pos, @size, hamming_dist) + end + end + end - tag10 = record[:SEQ][pos ... pos + 10].upcase.to_sym - tag11 = record[:SEQ][pos ... pos + 11].upcase.to_sym + nil + end - if gsmid_hash.has_key? tag10 - found_tag = tag10 - elsif rlmid_hash.has_key? tag11 - found_tag = tag11 - end + private - if mismatches > 0 and found_tag.nil? - if found_tag.nil? - gsmid_hash.each_pair do |mid, mid_name| - hamming_dist = tag10.to_s.hamming_distance(mid.to_s) + # Class to hold a MID object. + class MID + attr_accessor :tag, :name, :pos, :len, :mismatches - if hamming_dist <= mismatches - found_tag = mid + def initialize(tag, name, pos, len, mismatches) + @tag = tag + @name = name + @pos = pos + @len = len + @mismatches = mismatches + end - break - end - end - end + def to_hash + hash = {} + hash[:MID] = @tag + hash[:MID_NAME] = @name + hash[:MID_POS] = @pos + hash[:MID_LEN] = @len + hash[:MID_MISMATCHES] = @mismatches + hash + end + + def start + @pos + @len + end + end +end +casts = [] +casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil} +casts << {:long=>'gsmids', :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'rlmids', :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} - if found_tag.nil? - rlmid_hash.each_pair do |mid, mid_name| - hamming_dist = tag11.to_s.hamming_distance(mid.to_s) +options = Biopieces.options_parse(ARGV, casts) - if hamming_dist <= mismatches - found_tag = mid +raise "Can't remove only GSMIDs and only RLMIDs simultaniously." if options[:rlmids] and options[:gsmids] - break - end - end - end - end +pos = options[:pos] +mismatches = options[:mismatches] - unless found_tag.nil? - mid_name = (found_tag.length == 10) ? gsmid_hash[found_tag] : rlmid_hash[found_tag] +gsfinder = MIDfinder.new(GSMID_HASH, options[:pos], GS_SIZE, options[:mismatches]) +rlfinder = MIDfinder.new(RLMID_HASH, options[:pos], RL_SIZE, options[:mismatches]) - record[:MID] = found_tag - record[:MID_NAME] = mid_name - record[:MID_POS] = pos - record[:MID_LEN] = found_tag.length - record[:MID_MISMATCHES] = hamming_dist +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + if record.has_key? :SEQ + + if options[:gsmids] + mid = gsfinder.find_mid(record[:SEQ]) + elsif options[:rlmids] + mid = rlfinder.find_mid(record[:SEQ]) + else + mid = gsfinder.find_mid(record[:SEQ]) || rlfinder.find_mid(record[:SEQ]) + end - start = pos + found_tag.length + if mid + record.merge!(mid.to_hash) - record[:SEQ] = record[:SEQ][start .. -1] - record[:SCORES] = record[:SCORES][start .. -1] if record[:SCORES] + record[:SEQ] = record[:SEQ][mid.start .. -1] + record[:SCORES] = record[:SCORES][mid.start .. -1] if record[:SCORES] record[:SEQ_LEN] = record[:SEQ].length end end @@ -273,9 +302,9 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| output.puts record end end - - + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - + + __END__ diff --git a/bp_test/out/remove_mids.out.5 b/bp_test/out/remove_mids.out.5 new file mode 100644 index 0000000..ce5d1da --- /dev/null +++ b/bp_test/out/remove_mids.out.5 @@ -0,0 +1,52 @@ +SEQ_NAME: test_RL12_0_mismatches +SEQ: atcgACTCGCGTCGTgtgactgact +SEQ_LEN: 25 +--- +SEQ_NAME: test_RL12_1_mismatches +SEQ: atcgACTCGCcTCGTgtgactgact +SEQ_LEN: 25 +--- +SEQ_NAME: test_RL12_2_mismatches +SEQ: atcgACTCaCcTCGTgtgactgact +SEQ_LEN: 25 +--- +SEQ_NAME: test_RL12_2_mismatches_repeat +SEQ: atcgACTCaCcTCGTgtgactgact +SEQ_LEN: 25 +--- +SEQ_NAME: test_GS99_0_mismatches +SEQ: gtagtagtagt +SEQ_LEN: 11 +MID: CTGTACATAC +MID_NAME: MID99 +MID_POS: 4 +MID_LEN: 10 +MID_MISMATCHES: 0 +--- +SEQ_NAME: test_GS99_1_mismatches +SEQ: gtagtagtagt +SEQ_LEN: 11 +MID: CTGTACATAC +MID_NAME: MID99 +MID_POS: 4 +MID_LEN: 10 +MID_MISMATCHES: 1 +--- +SEQ_NAME: test_GS99_2_mismatches +SEQ: gtagtagtagt +SEQ_LEN: 11 +MID: CTGTACATAC +MID_NAME: MID99 +MID_POS: 4 +MID_LEN: 10 +MID_MISMATCHES: 2 +--- +SEQ_NAME: test_GS99_2_mismatches_repeat +SEQ: gtagtagtagt +SEQ_LEN: 11 +MID: CTGTACATAC +MID_NAME: MID99 +MID_POS: 4 +MID_LEN: 10 +MID_MISMATCHES: 2 +--- diff --git a/bp_test/out/remove_mids.out.6 b/bp_test/out/remove_mids.out.6 new file mode 100644 index 0000000..7cb712a --- /dev/null +++ b/bp_test/out/remove_mids.out.6 @@ -0,0 +1,52 @@ +SEQ_NAME: test_RL12_0_mismatches +SEQ: gtgactgact +SEQ_LEN: 10 +MID: ACTCGCGTCGT +MID_NAME: RL12 +MID_POS: 4 +MID_LEN: 11 +MID_MISMATCHES: 0 +--- +SEQ_NAME: test_RL12_1_mismatches +SEQ: gtgactgact +SEQ_LEN: 10 +MID: ACTCGCGTCGT +MID_NAME: RL12 +MID_POS: 4 +MID_LEN: 11 +MID_MISMATCHES: 1 +--- +SEQ_NAME: test_RL12_2_mismatches +SEQ: gtgactgact +SEQ_LEN: 10 +MID: ACTCGCGTCGT +MID_NAME: RL12 +MID_POS: 4 +MID_LEN: 11 +MID_MISMATCHES: 2 +--- +SEQ_NAME: test_RL12_2_mismatches_repeat +SEQ: gtgactgact +SEQ_LEN: 10 +MID: ACTCGCGTCGT +MID_NAME: RL12 +MID_POS: 4 +MID_LEN: 11 +MID_MISMATCHES: 2 +--- +SEQ_NAME: test_GS99_0_mismatches +SEQ: atcgCTGTACATACgtagtagtagt +SEQ_LEN: 25 +--- +SEQ_NAME: test_GS99_1_mismatches +SEQ: atcgCTGTtCATACgtagtagtagt +SEQ_LEN: 25 +--- +SEQ_NAME: test_GS99_2_mismatches +SEQ: atcgCTGTtCAgACgtagtagtagt +SEQ_LEN: 25 +--- +SEQ_NAME: test_GS99_2_mismatches_repeat +SEQ: atcgCTGTtCAgACgtagtagtagt +SEQ_LEN: 25 +--- diff --git a/bp_test/test/test_remove_mids b/bp_test/test/test_remove_mids index 2179d5e..92d0663 100755 --- a/bp_test/test/test_remove_mids +++ b/bp_test/test/test_remove_mids @@ -17,3 +17,11 @@ clean run "$bp -I $in -p 4 -m 2 -O $tmp" assert_no_diff $tmp $out.4 clean + +run "$bp -I $in -p 4 -m 2 -g -O $tmp" +assert_no_diff $tmp $out.5 +clean + +run "$bp -I $in -p 4 -m 2 -r -O $tmp" +assert_no_diff $tmp $out.6 +clean