]> git.donarmstrong.com Git - biopieces.git/commitdiff
rewrote remove_mids again
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 6 Sep 2011 09:53:26 +0000 (09:53 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 6 Sep 2011 09:53:26 +0000 (09:53 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1521 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/remove_mids
bp_test/out/remove_mids.out.5 [new file with mode: 0644]
bp_test/out/remove_mids.out.6 [new file with mode: 0644]
bp_test/test/test_remove_mids

index 74846c1fe4dcd56a67cc8604954e1737e2124fc5..c73d23ceec86f5fbaf51555e94402ab91c4a49a3 100755 (executable)
@@ -32,7 +32,10 @@ require 'maasha/biopieces'
 require 'maasha/bits'
 require 'pp'
 
-gsmid_hash = {
+GS_SIZE = 10
+RL_SIZE = 11
+
+GSMID_HASH = {
   ACGAGTGCGT: "MID1",
   TCTCTATGCG: "MID10",
   TAGACTGCAC: "MID100",
@@ -188,7 +191,7 @@ gsmid_hash = {
   CTGTACATAC: "MID99",
 }
 
-rlmid_hash = {
+RLMID_HASH = {
   ACACGACGACT: "RL1",
   ACACGTAGTAT: "RL2",
   ACACTACTCGT: "RL3",
@@ -203,69 +206,95 @@ rlmid_hash = {
   ACTCGCGTCGT: "RL12" 
 }
 
-casts = []
-casts << {:long=>'pos',        :short=>'p', :type=>'uint', :mandatory=>false, :default=>0,   :allowed=>nil,     :disallowed=>nil}
-casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0,   :allowed=>"0,1,2", :disallowed=>nil}
+class MIDfinder
+  def initialize(mid_hash, pos, size, max_mismatches)
+    @mid_hash       = mid_hash
+    @pos            = pos
+    @size           = size
+    @max_mismatches = max_mismatches
+  end
 
-options = Biopieces.options_parse(ARGV, casts)
+  def find_mid(seq)
+    hamming_dist = 0
+    tag          = seq[@pos ... @pos + @size].upcase.to_sym
 
-pos        = options[:pos]
-mismatches = options[:mismatches]
+    if @mid_hash.has_key? tag
+      return MID.new(tag, @mid_hash[tag], @pos, @size, hamming_dist)
+    elsif @max_mismatches > 0
+      @mid_hash.each_pair do |mid, mid_name|
+        hamming_dist = tag.to_s.hamming_distance(mid.to_s)
 
-Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
-  input.each_record do |record|
-    if record.has_key? :SEQ
-      found_tag    = nil
-      hamming_dist = 0
+        if hamming_dist <= @max_mismatches
+          return MID.new(mid, @mid_hash[mid], @pos, @size, hamming_dist)
+        end
+      end
+    end
 
-      tag10 = record[:SEQ][pos ... pos + 10].upcase.to_sym
-      tag11 = record[:SEQ][pos ... pos + 11].upcase.to_sym
+    nil
+  end
 
-      if gsmid_hash.has_key? tag10
-        found_tag = tag10
-      elsif rlmid_hash.has_key? tag11
-        found_tag = tag11
-      end
+  private
 
-      if mismatches > 0 and found_tag.nil?
-        if found_tag.nil?
-          gsmid_hash.each_pair do |mid, mid_name|
-            hamming_dist = tag10.to_s.hamming_distance(mid.to_s)
+  # Class to hold a MID object.
+  class MID
+    attr_accessor :tag, :name, :pos, :len, :mismatches
 
-            if hamming_dist <= mismatches
-              found_tag = mid
+    def initialize(tag, name, pos, len, mismatches)
+      @tag        = tag
+      @name       = name
+      @pos        = pos
+      @len        = len
+      @mismatches = mismatches
+    end
 
-              break
-            end
-          end
-        end
+    def to_hash
+      hash = {}
+      hash[:MID]            = @tag
+      hash[:MID_NAME]       = @name
+      hash[:MID_POS]        = @pos
+      hash[:MID_LEN]        = @len
+      hash[:MID_MISMATCHES] = @mismatches
+      hash
+    end
+
+    def start
+      @pos + @len
+    end
+  end
+end
+casts = []
+casts << {:long=>'pos',        :short=>'p', :type=>'uint', :mandatory=>false, :default=>0,   :allowed=>nil,     :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0,   :allowed=>"0,1,2", :disallowed=>nil}
+casts << {:long=>'gsmids',     :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil,     :disallowed=>nil}
+casts << {:long=>'rlmids',     :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil,     :disallowed=>nil}
 
-        if found_tag.nil?
-          rlmid_hash.each_pair do |mid, mid_name|
-            hamming_dist = tag11.to_s.hamming_distance(mid.to_s)
+options = Biopieces.options_parse(ARGV, casts)
 
-            if hamming_dist <= mismatches
-              found_tag = mid
+raise "Can't remove only GSMIDs and only RLMIDs simultaniously." if options[:rlmids] and options[:gsmids]
 
-              break
-            end
-          end
-        end
-      end
+pos         = options[:pos]
+mismatches  = options[:mismatches]
 
-      unless found_tag.nil?
-        mid_name = (found_tag.length == 10) ? gsmid_hash[found_tag] : rlmid_hash[found_tag]
+gsfinder = MIDfinder.new(GSMID_HASH, options[:pos], GS_SIZE, options[:mismatches])
+rlfinder = MIDfinder.new(RLMID_HASH, options[:pos], RL_SIZE, options[:mismatches])
 
-        record[:MID]            = found_tag
-        record[:MID_NAME]       = mid_name
-        record[:MID_POS]        = pos
-        record[:MID_LEN]        = found_tag.length
-        record[:MID_MISMATCHES] = hamming_dist
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+  input.each_record do |record|
+    if record.has_key? :SEQ
+
+      if options[:gsmids]
+        mid = gsfinder.find_mid(record[:SEQ])
+      elsif options[:rlmids]
+        mid = rlfinder.find_mid(record[:SEQ])
+      else
+        mid = gsfinder.find_mid(record[:SEQ]) || rlfinder.find_mid(record[:SEQ])
+      end
 
-        start = pos + found_tag.length
+      if mid
+        record.merge!(mid.to_hash)
 
-        record[:SEQ]     = record[:SEQ][start .. -1]
-        record[:SCORES]  = record[:SCORES][start .. -1] if record[:SCORES]
+        record[:SEQ]     = record[:SEQ][mid.start .. -1]
+        record[:SCORES]  = record[:SCORES][mid.start .. -1] if record[:SCORES]
         record[:SEQ_LEN] = record[:SEQ].length
       end
     end
@@ -273,9 +302,9 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
     output.puts record
   end
 end
-  
-  
+
+
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-  
-  
+
+
 __END__
diff --git a/bp_test/out/remove_mids.out.5 b/bp_test/out/remove_mids.out.5
new file mode 100644 (file)
index 0000000..ce5d1da
--- /dev/null
@@ -0,0 +1,52 @@
+SEQ_NAME: test_RL12_0_mismatches
+SEQ: atcgACTCGCGTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_1_mismatches
+SEQ: atcgACTCGCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_2_mismatches
+SEQ: atcgACTCaCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_2_mismatches_repeat
+SEQ: atcgACTCaCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_0_mismatches
+SEQ: gtagtagtagt
+SEQ_LEN: 11
+MID: CTGTACATAC
+MID_NAME: MID99
+MID_POS: 4
+MID_LEN: 10
+MID_MISMATCHES: 0
+---
+SEQ_NAME: test_GS99_1_mismatches
+SEQ: gtagtagtagt
+SEQ_LEN: 11
+MID: CTGTACATAC
+MID_NAME: MID99
+MID_POS: 4
+MID_LEN: 10
+MID_MISMATCHES: 1
+---
+SEQ_NAME: test_GS99_2_mismatches
+SEQ: gtagtagtagt
+SEQ_LEN: 11
+MID: CTGTACATAC
+MID_NAME: MID99
+MID_POS: 4
+MID_LEN: 10
+MID_MISMATCHES: 2
+---
+SEQ_NAME: test_GS99_2_mismatches_repeat
+SEQ: gtagtagtagt
+SEQ_LEN: 11
+MID: CTGTACATAC
+MID_NAME: MID99
+MID_POS: 4
+MID_LEN: 10
+MID_MISMATCHES: 2
+---
diff --git a/bp_test/out/remove_mids.out.6 b/bp_test/out/remove_mids.out.6
new file mode 100644 (file)
index 0000000..7cb712a
--- /dev/null
@@ -0,0 +1,52 @@
+SEQ_NAME: test_RL12_0_mismatches
+SEQ: gtgactgact
+SEQ_LEN: 10
+MID: ACTCGCGTCGT
+MID_NAME: RL12
+MID_POS: 4
+MID_LEN: 11
+MID_MISMATCHES: 0
+---
+SEQ_NAME: test_RL12_1_mismatches
+SEQ: gtgactgact
+SEQ_LEN: 10
+MID: ACTCGCGTCGT
+MID_NAME: RL12
+MID_POS: 4
+MID_LEN: 11
+MID_MISMATCHES: 1
+---
+SEQ_NAME: test_RL12_2_mismatches
+SEQ: gtgactgact
+SEQ_LEN: 10
+MID: ACTCGCGTCGT
+MID_NAME: RL12
+MID_POS: 4
+MID_LEN: 11
+MID_MISMATCHES: 2
+---
+SEQ_NAME: test_RL12_2_mismatches_repeat
+SEQ: gtgactgact
+SEQ_LEN: 10
+MID: ACTCGCGTCGT
+MID_NAME: RL12
+MID_POS: 4
+MID_LEN: 11
+MID_MISMATCHES: 2
+---
+SEQ_NAME: test_GS99_0_mismatches
+SEQ: atcgCTGTACATACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_1_mismatches
+SEQ: atcgCTGTtCATACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_2_mismatches
+SEQ: atcgCTGTtCAgACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_2_mismatches_repeat
+SEQ: atcgCTGTtCAgACgtagtagtagt
+SEQ_LEN: 25
+---
index 2179d5e90459a7471efe041d756c0aaa7fe8a955..92d06631e6ce953c0966d7844f591046fa05932a 100755 (executable)
@@ -17,3 +17,11 @@ clean
 run "$bp -I $in -p 4 -m 2 -O $tmp"
 assert_no_diff $tmp $out.4
 clean
+
+run "$bp -I $in -p 4 -m 2 -g -O $tmp"
+assert_no_diff $tmp $out.5
+clean
+
+run "$bp -I $in -p 4 -m 2 -r -O $tmp"
+assert_no_diff $tmp $out.6
+clean