]> git.donarmstrong.com Git - biopieces.git/commitdiff
updated remove_mids
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 5 Sep 2011 20:43:34 +0000 (20:43 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Mon, 5 Sep 2011 20:43:34 +0000 (20:43 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1518 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/remove_mids
bp_test/in/remove_mids.in
bp_test/out/remove_mids.out.1
bp_test/out/remove_mids.out.2
bp_test/test/test_remove_mids

index 44714f8b452fae2fb03fc0a374b32017aacdd289..39f575c688dc3fc80a80e39c5e37ea497565c3f0 100755 (executable)
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-# Find and remove MID tags in sequences in the stream.
+# Find MID tags in sequences in the stream.
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-
 require 'maasha/biopieces'
+require 'maasha/bits'
 require 'pp'
 
-MID_LEN = 10
-
-mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG
-           ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG
-           TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA
-           TCACGTACTA CGTCTAGTAC TCTACGTAGC TGTACTACTC ACGACTACAG
-           CGTAGACTAG TACGAGTATG TACTCTCGTG TAGAGACGAG TCGTCGCTCG
-           ACATACGCGT ACGCGAGTAT ACTACTATGT ACTGTACAGT AGACTATACT
-           AGCGTCGTCT AGTACGCTAT ATAGAGTACT CACGCTACGT CAGTAGACGT
-           CGACGTGACT TACACACACT TACACGTGAT TACAGATCGT TACGCTGTCT
-           TAGTGTAGAT TCGATCACGT TCGCACTAGT TCTAGCGACT TCTATACTAT
-           TGACGTATGT TGTGAGTAGT ACAGTATATA ACGCGATCGA ACTAGCAGTA
-           AGCTCACGTA AGTATACATA AGTCGAGAGA AGTGCTACGA CGATCGTATA
-           CGCAGTACGA CGCGTATACA CGTACAGTCA CGTACTCAGA CTACGCTCTA
-           CTATAGCGTA TACGTCATCA TAGTCGCATA TATATATACA TATGCTAGTA
-           TCACGCGAGA TCGATAGTGA TCGCTGCGTA TCTGACGTCA TGAGTCAGTA
-           TGTAGTGTGA TGTCACACGA TGTCGTCGCA ACACATACGC ACAGTCGTGC
-           ACATGACGAC ACGACAGCTC ACGTCTCATC ACTCATCTAC ACTCGCGCAC
-           AGAGCGTCAC AGCGACTAGC AGTAGTGATC AGTGACACAC AGTGTATGTC
-           ATAGATAGAC ATATAGTCGC ATCTACTGAC CACGTAGATC CACGTGTCGC
-           CATACTCTAC CGACACTATC CGAGACGCGC CGTATGCGAC CGTCGATCTC
-           CTACGACTGC CTAGTCACTC CTCTACGCTC CTGTACATAC TAGACTGCAC
-           TAGCGCGCGC TAGCTCTATC TATAGACATC TATGATACGC TCACTCATAC
-           TCATCGAGTC TCGAGCTCTC TCGCAGACAC TCTGTCTCGC TGAGTGACGC
-           TGATGTGTAC TGCTATAGAC TGCTCGCTAC ACGTGCAGCG ACTCACAGAG
-           AGACTCAGCG AGAGAGTGTG AGCTATCGCG AGTCTGACTG AGTGAGCTCG
-           ATAGCTCTCG ATCACGTGCG ATCGTAGCAG ATCGTCTGTG ATGTACGATG
-           ATGTGTCTAG CACACGATAG CACTCGCACG CAGACGTCTG CAGTACTGCG
-           CGACAGCGAG CGATCTGTCG CGCGTGCTAG CGCTCGAGTG CGTGATGACG
-           CTATGTACAG CTCGATATAG CTCGCACGCG CTGCGTCACG CTGTGCGTCG
-           TAGCATACTG TATACATGTG TATCACTCAG TATCTGATAG TCGTGACATG
-           TCTGATCGAG TGACATCTCG TGAGCTAGAG TGATAGAGCG TGCGTGTGCG
-           TGCTAGTCAG TGTATCACAG TGTGCGCGTG ACACGACGAC ACACGTAGTA
-           ACACTACTCG ACGACACGTA ACGAGTAGAC ACGCGTCTAG ACGTACACAC
-           ACGTACTGTG ACGTAGATCG ACTACGTCTC ACTATACGAG ACTCGCGTCG
+gsmid_hash = {
+  ACGAGTGCGT: "MID1",
+  TCTCTATGCG: "MID10",
+  TAGACTGCAC: "MID100",
+  TAGCGCGCGC: "MID101",
+  TAGCTCTATC: "MID102",
+  TATAGACATC: "MID103",
+  TATGATACGC: "MID104",
+  TCACTCATAC: "MID105",
+  TCATCGAGTC: "MID106",
+  TCGAGCTCTC: "MID107",
+  TCGCAGACAC: "MID108",
+  TCTGTCTCGC: "MID109",
+  TGATACGTCT: "MID11",
+  TGAGTGACGC: "MID110",
+  TGATGTGTAC: "MID111",
+  TGCTATAGAC: "MID112",
+  TGCTCGCTAC: "MID113",
+  ACGTGCAGCG: "MID114",
+  ACTCACAGAG: "MID115",
+  AGACTCAGCG: "MID116",
+  AGAGAGTGTG: "MID117",
+  AGCTATCGCG: "MID118",
+  AGTCTGACTG: "MID119",
+  TACTGAGCTA: "MID12",
+  AGTGAGCTCG: "MID120",
+  ATAGCTCTCG: "MID121",
+  ATCACGTGCG: "MID122",
+  ATCGTAGCAG: "MID123",
+  ATCGTCTGTG: "MID124",
+  ATGTACGATG: "MID125",
+  ATGTGTCTAG: "MID126",
+  CACACGATAG: "MID127",
+  CACTCGCACG: "MID128",
+  CAGACGTCTG: "MID129",
+  CATAGTAGTG: "MID13",
+  CAGTACTGCG: "MID130",
+  CGACAGCGAG: "MID131",
+  CGATCTGTCG: "MID132",
+  CGCGTGCTAG: "MID133",
+  CGCTCGAGTG: "MID134",
+  CGTGATGACG: "MID135",
+  CTATGTACAG: "MID136",
+  CTCGATATAG: "MID137",
+  CTCGCACGCG: "MID138",
+  CTGCGTCACG: "MID139",
+  CGAGAGATAC: "MID14",
+  CTGTGCGTCG: "MID140",
+  TAGCATACTG: "MID141",
+  TATACATGTG: "MID142",
+  TATCACTCAG: "MID143",
+  TATCTGATAG: "MID144",
+  TCGTGACATG: "MID145",
+  TCTGATCGAG: "MID146",
+  TGACATCTCG: "MID147",
+  TGAGCTAGAG: "MID148",
+  TGATAGAGCG: "MID149",
+  ATACGACGTA: "MID15",
+  TGCGTGTGCG: "MID150",
+  TGCTAGTCAG: "MID151",
+  TGTATCACAG: "MID152",
+  TGTGCGCGTG: "MID153",
+  TCACGTACTA: "MID16",
+  CGTCTAGTAC: "MID17",
+  TCTACGTAGC: "MID18",
+  TGTACTACTC: "MID19",
+  ACGCTCGACA: "MID2",
+  ACGACTACAG: "MID20",
+  CGTAGACTAG: "MID21",
+  TACGAGTATG: "MID22",
+  TACTCTCGTG: "MID23",
+  TAGAGACGAG: "MID24",
+  TCGTCGCTCG: "MID25",
+  ACATACGCGT: "MID26",
+  ACGCGAGTAT: "MID27",
+  ACTACTATGT: "MID28",
+  ACTGTACAGT: "MID29",
+  AGACGCACTC: "MID3",
+  AGACTATACT: "MID30",
+  AGCGTCGTCT: "MID31",
+  AGTACGCTAT: "MID32",
+  ATAGAGTACT: "MID33",
+  CACGCTACGT: "MID34",
+  CAGTAGACGT: "MID35",
+  CGACGTGACT: "MID36",
+  TACACACACT: "MID37",
+  TACACGTGAT: "MID38",
+  TACAGATCGT: "MID39",
+  AGCACTGTAG: "MID4",
+  TACGCTGTCT: "MID40",
+  TAGTGTAGAT: "MID41",
+  TCGATCACGT: "MID42",
+  TCGCACTAGT: "MID43",
+  TCTAGCGACT: "MID44",
+  TCTATACTAT: "MID45",
+  TGACGTATGT: "MID46",
+  TGTGAGTAGT: "MID47",
+  ACAGTATATA: "MID48",
+  ACGCGATCGA: "MID49",
+  ATCAGACACG: "MID5",
+  ACTAGCAGTA: "MID50",
+  AGCTCACGTA: "MID51",
+  AGTATACATA: "MID52",
+  AGTCGAGAGA: "MID53",
+  AGTGCTACGA: "MID54",
+  CGATCGTATA: "MID55",
+  CGCAGTACGA: "MID56",
+  CGCGTATACA: "MID57",
+  CGTACAGTCA: "MID58",
+  CGTACTCAGA: "MID59",
+  ATATCGCGAG: "MID6",
+  CTACGCTCTA: "MID60",
+  CTATAGCGTA: "MID61",
+  TACGTCATCA: "MID62",
+  TAGTCGCATA: "MID63",
+  TATATATACA: "MID64",
+  TATGCTAGTA: "MID65",
+  TCACGCGAGA: "MID66",
+  TCGATAGTGA: "MID67",
+  TCGCTGCGTA: "MID68",
+  TCTGACGTCA: "MID69",
+  CGTGTCTCTA: "MID7",
+  TGAGTCAGTA: "MID70",
+  TGTAGTGTGA: "MID71",
+  TGTCACACGA: "MID72",
+  TGTCGTCGCA: "MID73",
+  ACACATACGC: "MID74",
+  ACAGTCGTGC: "MID75",
+  ACATGACGAC: "MID76",
+  ACGACAGCTC: "MID77",
+  ACGTCTCATC: "MID78",
+  ACTCATCTAC: "MID79",
+  CTCGCGTGTC: "MID8",
+  ACTCGCGCAC: "MID80",
+  AGAGCGTCAC: "MID81",
+  AGCGACTAGC: "MID82",
+  AGTAGTGATC: "MID83",
+  AGTGACACAC: "MID84",
+  AGTGTATGTC: "MID85",
+  ATAGATAGAC: "MID86",
+  ATATAGTCGC: "MID87",
+  ATCTACTGAC: "MID88",
+  CACGTAGATC: "MID89",
+  TAGTATCAGC: "MID9",
+  CACGTGTCGC: "MID90",
+  CATACTCTAC: "MID91",
+  CGACACTATC: "MID92",
+  CGAGACGCGC: "MID93",
+  CGTATGCGAC: "MID94",
+  CGTCGATCTC: "MID95",
+  CTACGACTGC: "MID96",
+  CTAGTCACTC: "MID97",
+  CTCTACGCTC: "MID98",
+  CTGTACATAC: "MID99",
 }
 
-
-mid_hash = {}
-
-mids.each_with_index do |mid, i|
-  mid_hash[mid] = i
-end
+rlmid_hash = {
+  ACACGACGACT: "RL1",
+  ACACGTAGTAT: "RL2",
+  ACACTACTCGT: "RL3",
+  ACGACACGTAT: "RL4",
+  ACGAGTAGACT: "RL5",
+  ACGCGTCTAGT: "RL6",
+  ACGTACACACT: "RL7",
+  ACGTACTGTGT: "RL8",
+  ACGTAGATCGT: "RL9",
+  ACTACGTCTCT: "RL10",
+  ACTATACGAGT: "RL11",
+  ACTCGCGTCGT: "RL12" 
+}
 
 casts = []
-casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'pos',        :short=>'p', :type=>'uint', :mandatory=>false, :default=>0,   :allowed=>nil,     :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0,   :allowed=>"0,1,2", :disallowed=>nil}
 
 options = Biopieces.options_parse(ARGV, casts)
 
-pos = options[:pos]
+pos        = options[:pos]
+mismatches = options[:mismatches]
 
 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+  aux_hash = {}
+
   input.each_record do |record|
     if record.has_key? :SEQ
-      tag = record[:SEQ][pos ... pos + MID_LEN].upcase
+      found_tag    = nil
+      hamming_dist = 0
+
+      tag10 = record[:SEQ][pos ... pos + 10].upcase.to_sym
+      tag11 = record[:SEQ][pos ... pos + 11].upcase.to_sym
+
+      if gsmid_hash.has_key? tag10
+        found_tag = tag10
+      elsif rlmid_hash.has_key? tag11
+        found_tag = tag11
+      end
+
+      if mismatches > 0 and found_tag.nil?
+        if aux_hash.has_key? tag10
+          found_tag = tag10
+        elsif aux_hash.has_key? tag11
+          found_tag = tag11
+        end
+
+        if found_tag.nil?
+          gsmid_hash.each_pair do |mid, mid_name|
+            hamming_dist = tag10.to_s.hamming_distance(mid.to_s)
+
+            if hamming_dist <= mismatches
+              found_tag = mid
+
+              aux_hash[tag10] = mid_name
 
-      if mid_hash[tag]
-        record[:SEQ]     = record[:SEQ][pos + MID_LEN ... record[:SEQ].length]
-        record[:SCORES]  = record[:SCORES][pos + MID_LEN ... record[:SCORES].length] if record[:SCORES]
-        record[:MID]     = tag
-        record[:MID_NUM] = mid_hash[tag] + 1
+              break
+            end
+          end
+        end
+
+        if found_tag.nil?
+          rlmid_hash.each_pair do |mid, mid_name|
+            hamming_dist = tag11.to_s.hamming_distance(mid.to_s)
+
+            if hamming_dist <= mismatches
+              found_tag = mid
+
+              aux_hash[tag11] = mid_name
+
+              break
+            end
+          end
+        end
+      end
+
+      unless found_tag.nil?
+        mid_name = (found_tag.length == 10) ? gsmid_hash[found_tag] : rlmid_hash[found_tag]
+
+        record[:MID]            = found_tag
+        record[:MID_NAME]       = mid_name
+        record[:MID_POS]        = pos
+        record[:MID_LEN]        = found_tag.length
+        record[:MID_MISMATCHES] = hamming_dist
+
+        start = pos + found_tag.length
+
+        record[:SEQ]     = record[:SEQ][start .. -1]
+        record[:SCORES]  = record[:SCORES][start .. -1] if record[:SCORES]
         record[:SEQ_LEN] = record[:SEQ].length
       end
     end
@@ -100,9 +285,9 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
     output.puts record
   end
 end
-
-
+  
+  
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
+  
+  
 __END__
index 3f9eabc3506a9116028b2baec8f9a86243c71030..c8e8716f371b406e6714547b512ada23b34c8cf6 100644 (file)
@@ -1,8 +1,32 @@
-SEQ_NAME: test1
-SEQ: atcgTAGACTGCACgtcag
-SEQ_LEN: 19
+SEQ_NAME: test_RL12_0_mismatches
+SEQ: atcgACTCGCGTCGTgtgactgact
+SEQ_LEN: 25
 ---
-SEQ_NAME: test2
-SEQ: TAGACTGCACgtcag
-SEQ_LEN: 15
+SEQ_NAME: test_RL12_1_mismatches
+SEQ: atcgACTCGCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_2_mismatches
+SEQ: atcgACTCaCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_2_mismatches_repeat
+SEQ: atcgACTCaCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_0_mismatches
+SEQ: atcgCTGTACATACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_1_mismatches
+SEQ: atcgCTGTtCATACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_2_mismatches
+SEQ: atcgCTGTtCAgACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_2_mismatches_repeat
+SEQ: atcgCTGTtCAgACgtagtagtagt
+SEQ_LEN: 25
 ---
index a492e519051b61e12a79b666816784f44b971a0f..c8e8716f371b406e6714547b512ada23b34c8cf6 100644 (file)
@@ -1,10 +1,32 @@
-SEQ_NAME: test1
-SEQ: atcgTAGACTGCACgtcag
-SEQ_LEN: 19
----
-SEQ_NAME: test2
-SEQ: gtcag
-SEQ_LEN: 5
-MID: TAGACTGCAC
-MID_NUM: 100
+SEQ_NAME: test_RL12_0_mismatches
+SEQ: atcgACTCGCGTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_1_mismatches
+SEQ: atcgACTCGCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_2_mismatches
+SEQ: atcgACTCaCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_2_mismatches_repeat
+SEQ: atcgACTCaCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_0_mismatches
+SEQ: atcgCTGTACATACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_1_mismatches
+SEQ: atcgCTGTtCATACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_2_mismatches
+SEQ: atcgCTGTtCAgACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_2_mismatches_repeat
+SEQ: atcgCTGTtCAgACgtagtagtagt
+SEQ_LEN: 25
 ---
index bd47b2ddd3e726c78e0cc111c35d99f0287345a3..036cdbee5a9e8c6e6af920ec22c5a4b427102591 100644 (file)
@@ -1,10 +1,42 @@
-SEQ_NAME: test1
-SEQ: gtcag
-SEQ_LEN: 5
-MID: TAGACTGCAC
-MID_NUM: 100
----
-SEQ_NAME: test2
-SEQ: TAGACTGCACgtcag
-SEQ_LEN: 15
+SEQ_NAME: test_RL12_0_mismatches
+SEQ: gtgactgact
+SEQ_LEN: 10
+MID: ACTCGCGTCGT
+MID_NAME: RL12
+MID_POS: 4
+MID_LEN: 11
+MID_MISMATCHES: 0
+---
+SEQ_NAME: test_RL12_1_mismatches
+SEQ: atcgACTCGCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_2_mismatches
+SEQ: atcgACTCaCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_RL12_2_mismatches_repeat
+SEQ: atcgACTCaCcTCGTgtgactgact
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_0_mismatches
+SEQ: gtagtagtagt
+SEQ_LEN: 11
+MID: CTGTACATAC
+MID_NAME: MID99
+MID_POS: 4
+MID_LEN: 10
+MID_MISMATCHES: 0
+---
+SEQ_NAME: test_GS99_1_mismatches
+SEQ: atcgCTGTtCATACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_2_mismatches
+SEQ: atcgCTGTtCAgACgtagtagtagt
+SEQ_LEN: 25
+---
+SEQ_NAME: test_GS99_2_mismatches_repeat
+SEQ: atcgCTGTtCAgACgtagtagtagt
+SEQ_LEN: 25
 ---
index 79717d034b7d71b9a64cdabcf7c91118b2d7890e..2179d5e90459a7471efe041d756c0aaa7fe8a955 100755 (executable)
@@ -9,3 +9,11 @@ clean
 run "$bp -I $in -p 4 -O $tmp"
 assert_no_diff $tmp $out.2
 clean
+
+run "$bp -I $in -p 4 -m 1 -O $tmp"
+assert_no_diff $tmp $out.3
+clean
+
+run "$bp -I $in -p 4 -m 2 -O $tmp"
+assert_no_diff $tmp $out.4
+clean