# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-# Find and remove MID tags in sequences in the stream.
+# Find MID tags in sequences in the stream.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
require 'maasha/biopieces'
+require 'maasha/bits'
require 'pp'
-MID_LEN = 10
-
-mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG
- ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG
- TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA
- TCACGTACTA CGTCTAGTAC TCTACGTAGC TGTACTACTC ACGACTACAG
- CGTAGACTAG TACGAGTATG TACTCTCGTG TAGAGACGAG TCGTCGCTCG
- ACATACGCGT ACGCGAGTAT ACTACTATGT ACTGTACAGT AGACTATACT
- AGCGTCGTCT AGTACGCTAT ATAGAGTACT CACGCTACGT CAGTAGACGT
- CGACGTGACT TACACACACT TACACGTGAT TACAGATCGT TACGCTGTCT
- TAGTGTAGAT TCGATCACGT TCGCACTAGT TCTAGCGACT TCTATACTAT
- TGACGTATGT TGTGAGTAGT ACAGTATATA ACGCGATCGA ACTAGCAGTA
- AGCTCACGTA AGTATACATA AGTCGAGAGA AGTGCTACGA CGATCGTATA
- CGCAGTACGA CGCGTATACA CGTACAGTCA CGTACTCAGA CTACGCTCTA
- CTATAGCGTA TACGTCATCA TAGTCGCATA TATATATACA TATGCTAGTA
- TCACGCGAGA TCGATAGTGA TCGCTGCGTA TCTGACGTCA TGAGTCAGTA
- TGTAGTGTGA TGTCACACGA TGTCGTCGCA ACACATACGC ACAGTCGTGC
- ACATGACGAC ACGACAGCTC ACGTCTCATC ACTCATCTAC ACTCGCGCAC
- AGAGCGTCAC AGCGACTAGC AGTAGTGATC AGTGACACAC AGTGTATGTC
- ATAGATAGAC ATATAGTCGC ATCTACTGAC CACGTAGATC CACGTGTCGC
- CATACTCTAC CGACACTATC CGAGACGCGC CGTATGCGAC CGTCGATCTC
- CTACGACTGC CTAGTCACTC CTCTACGCTC CTGTACATAC TAGACTGCAC
- TAGCGCGCGC TAGCTCTATC TATAGACATC TATGATACGC TCACTCATAC
- TCATCGAGTC TCGAGCTCTC TCGCAGACAC TCTGTCTCGC TGAGTGACGC
- TGATGTGTAC TGCTATAGAC TGCTCGCTAC ACGTGCAGCG ACTCACAGAG
- AGACTCAGCG AGAGAGTGTG AGCTATCGCG AGTCTGACTG AGTGAGCTCG
- ATAGCTCTCG ATCACGTGCG ATCGTAGCAG ATCGTCTGTG ATGTACGATG
- ATGTGTCTAG CACACGATAG CACTCGCACG CAGACGTCTG CAGTACTGCG
- CGACAGCGAG CGATCTGTCG CGCGTGCTAG CGCTCGAGTG CGTGATGACG
- CTATGTACAG CTCGATATAG CTCGCACGCG CTGCGTCACG CTGTGCGTCG
- TAGCATACTG TATACATGTG TATCACTCAG TATCTGATAG TCGTGACATG
- TCTGATCGAG TGACATCTCG TGAGCTAGAG TGATAGAGCG TGCGTGTGCG
- TGCTAGTCAG TGTATCACAG TGTGCGCGTG ACACGACGAC ACACGTAGTA
- ACACTACTCG ACGACACGTA ACGAGTAGAC ACGCGTCTAG ACGTACACAC
- ACGTACTGTG ACGTAGATCG ACTACGTCTC ACTATACGAG ACTCGCGTCG
+gsmid_hash = {
+ ACGAGTGCGT: "MID1",
+ TCTCTATGCG: "MID10",
+ TAGACTGCAC: "MID100",
+ TAGCGCGCGC: "MID101",
+ TAGCTCTATC: "MID102",
+ TATAGACATC: "MID103",
+ TATGATACGC: "MID104",
+ TCACTCATAC: "MID105",
+ TCATCGAGTC: "MID106",
+ TCGAGCTCTC: "MID107",
+ TCGCAGACAC: "MID108",
+ TCTGTCTCGC: "MID109",
+ TGATACGTCT: "MID11",
+ TGAGTGACGC: "MID110",
+ TGATGTGTAC: "MID111",
+ TGCTATAGAC: "MID112",
+ TGCTCGCTAC: "MID113",
+ ACGTGCAGCG: "MID114",
+ ACTCACAGAG: "MID115",
+ AGACTCAGCG: "MID116",
+ AGAGAGTGTG: "MID117",
+ AGCTATCGCG: "MID118",
+ AGTCTGACTG: "MID119",
+ TACTGAGCTA: "MID12",
+ AGTGAGCTCG: "MID120",
+ ATAGCTCTCG: "MID121",
+ ATCACGTGCG: "MID122",
+ ATCGTAGCAG: "MID123",
+ ATCGTCTGTG: "MID124",
+ ATGTACGATG: "MID125",
+ ATGTGTCTAG: "MID126",
+ CACACGATAG: "MID127",
+ CACTCGCACG: "MID128",
+ CAGACGTCTG: "MID129",
+ CATAGTAGTG: "MID13",
+ CAGTACTGCG: "MID130",
+ CGACAGCGAG: "MID131",
+ CGATCTGTCG: "MID132",
+ CGCGTGCTAG: "MID133",
+ CGCTCGAGTG: "MID134",
+ CGTGATGACG: "MID135",
+ CTATGTACAG: "MID136",
+ CTCGATATAG: "MID137",
+ CTCGCACGCG: "MID138",
+ CTGCGTCACG: "MID139",
+ CGAGAGATAC: "MID14",
+ CTGTGCGTCG: "MID140",
+ TAGCATACTG: "MID141",
+ TATACATGTG: "MID142",
+ TATCACTCAG: "MID143",
+ TATCTGATAG: "MID144",
+ TCGTGACATG: "MID145",
+ TCTGATCGAG: "MID146",
+ TGACATCTCG: "MID147",
+ TGAGCTAGAG: "MID148",
+ TGATAGAGCG: "MID149",
+ ATACGACGTA: "MID15",
+ TGCGTGTGCG: "MID150",
+ TGCTAGTCAG: "MID151",
+ TGTATCACAG: "MID152",
+ TGTGCGCGTG: "MID153",
+ TCACGTACTA: "MID16",
+ CGTCTAGTAC: "MID17",
+ TCTACGTAGC: "MID18",
+ TGTACTACTC: "MID19",
+ ACGCTCGACA: "MID2",
+ ACGACTACAG: "MID20",
+ CGTAGACTAG: "MID21",
+ TACGAGTATG: "MID22",
+ TACTCTCGTG: "MID23",
+ TAGAGACGAG: "MID24",
+ TCGTCGCTCG: "MID25",
+ ACATACGCGT: "MID26",
+ ACGCGAGTAT: "MID27",
+ ACTACTATGT: "MID28",
+ ACTGTACAGT: "MID29",
+ AGACGCACTC: "MID3",
+ AGACTATACT: "MID30",
+ AGCGTCGTCT: "MID31",
+ AGTACGCTAT: "MID32",
+ ATAGAGTACT: "MID33",
+ CACGCTACGT: "MID34",
+ CAGTAGACGT: "MID35",
+ CGACGTGACT: "MID36",
+ TACACACACT: "MID37",
+ TACACGTGAT: "MID38",
+ TACAGATCGT: "MID39",
+ AGCACTGTAG: "MID4",
+ TACGCTGTCT: "MID40",
+ TAGTGTAGAT: "MID41",
+ TCGATCACGT: "MID42",
+ TCGCACTAGT: "MID43",
+ TCTAGCGACT: "MID44",
+ TCTATACTAT: "MID45",
+ TGACGTATGT: "MID46",
+ TGTGAGTAGT: "MID47",
+ ACAGTATATA: "MID48",
+ ACGCGATCGA: "MID49",
+ ATCAGACACG: "MID5",
+ ACTAGCAGTA: "MID50",
+ AGCTCACGTA: "MID51",
+ AGTATACATA: "MID52",
+ AGTCGAGAGA: "MID53",
+ AGTGCTACGA: "MID54",
+ CGATCGTATA: "MID55",
+ CGCAGTACGA: "MID56",
+ CGCGTATACA: "MID57",
+ CGTACAGTCA: "MID58",
+ CGTACTCAGA: "MID59",
+ ATATCGCGAG: "MID6",
+ CTACGCTCTA: "MID60",
+ CTATAGCGTA: "MID61",
+ TACGTCATCA: "MID62",
+ TAGTCGCATA: "MID63",
+ TATATATACA: "MID64",
+ TATGCTAGTA: "MID65",
+ TCACGCGAGA: "MID66",
+ TCGATAGTGA: "MID67",
+ TCGCTGCGTA: "MID68",
+ TCTGACGTCA: "MID69",
+ CGTGTCTCTA: "MID7",
+ TGAGTCAGTA: "MID70",
+ TGTAGTGTGA: "MID71",
+ TGTCACACGA: "MID72",
+ TGTCGTCGCA: "MID73",
+ ACACATACGC: "MID74",
+ ACAGTCGTGC: "MID75",
+ ACATGACGAC: "MID76",
+ ACGACAGCTC: "MID77",
+ ACGTCTCATC: "MID78",
+ ACTCATCTAC: "MID79",
+ CTCGCGTGTC: "MID8",
+ ACTCGCGCAC: "MID80",
+ AGAGCGTCAC: "MID81",
+ AGCGACTAGC: "MID82",
+ AGTAGTGATC: "MID83",
+ AGTGACACAC: "MID84",
+ AGTGTATGTC: "MID85",
+ ATAGATAGAC: "MID86",
+ ATATAGTCGC: "MID87",
+ ATCTACTGAC: "MID88",
+ CACGTAGATC: "MID89",
+ TAGTATCAGC: "MID9",
+ CACGTGTCGC: "MID90",
+ CATACTCTAC: "MID91",
+ CGACACTATC: "MID92",
+ CGAGACGCGC: "MID93",
+ CGTATGCGAC: "MID94",
+ CGTCGATCTC: "MID95",
+ CTACGACTGC: "MID96",
+ CTAGTCACTC: "MID97",
+ CTCTACGCTC: "MID98",
+ CTGTACATAC: "MID99",
}
-
-mid_hash = {}
-
-mids.each_with_index do |mid, i|
- mid_hash[mid] = i
-end
+rlmid_hash = {
+ ACACGACGACT: "RL1",
+ ACACGTAGTAT: "RL2",
+ ACACTACTCGT: "RL3",
+ ACGACACGTAT: "RL4",
+ ACGAGTAGACT: "RL5",
+ ACGCGTCTAGT: "RL6",
+ ACGTACACACT: "RL7",
+ ACGTACTGTGT: "RL8",
+ ACGTAGATCGT: "RL9",
+ ACTACGTCTCT: "RL10",
+ ACTATACGAGT: "RL11",
+ ACTCGCGTCGT: "RL12"
+}
casts = []
-casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil}
options = Biopieces.options_parse(ARGV, casts)
-pos = options[:pos]
+pos = options[:pos]
+mismatches = options[:mismatches]
Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+ aux_hash = {}
+
input.each_record do |record|
if record.has_key? :SEQ
- tag = record[:SEQ][pos ... pos + MID_LEN].upcase
+ found_tag = nil
+ hamming_dist = 0
+
+ tag10 = record[:SEQ][pos ... pos + 10].upcase.to_sym
+ tag11 = record[:SEQ][pos ... pos + 11].upcase.to_sym
+
+ if gsmid_hash.has_key? tag10
+ found_tag = tag10
+ elsif rlmid_hash.has_key? tag11
+ found_tag = tag11
+ end
+
+ if mismatches > 0 and found_tag.nil?
+ if aux_hash.has_key? tag10
+ found_tag = tag10
+ elsif aux_hash.has_key? tag11
+ found_tag = tag11
+ end
+
+ if found_tag.nil?
+ gsmid_hash.each_pair do |mid, mid_name|
+ hamming_dist = tag10.to_s.hamming_distance(mid.to_s)
+
+ if hamming_dist <= mismatches
+ found_tag = mid
+
+ aux_hash[tag10] = mid_name
- if mid_hash[tag]
- record[:SEQ] = record[:SEQ][pos + MID_LEN ... record[:SEQ].length]
- record[:SCORES] = record[:SCORES][pos + MID_LEN ... record[:SCORES].length] if record[:SCORES]
- record[:MID] = tag
- record[:MID_NUM] = mid_hash[tag] + 1
+ break
+ end
+ end
+ end
+
+ if found_tag.nil?
+ rlmid_hash.each_pair do |mid, mid_name|
+ hamming_dist = tag11.to_s.hamming_distance(mid.to_s)
+
+ if hamming_dist <= mismatches
+ found_tag = mid
+
+ aux_hash[tag11] = mid_name
+
+ break
+ end
+ end
+ end
+ end
+
+ unless found_tag.nil?
+ mid_name = (found_tag.length == 10) ? gsmid_hash[found_tag] : rlmid_hash[found_tag]
+
+ record[:MID] = found_tag
+ record[:MID_NAME] = mid_name
+ record[:MID_POS] = pos
+ record[:MID_LEN] = found_tag.length
+ record[:MID_MISMATCHES] = hamming_dist
+
+ start = pos + found_tag.length
+
+ record[:SEQ] = record[:SEQ][start .. -1]
+ record[:SCORES] = record[:SCORES][start .. -1] if record[:SCORES]
record[:SEQ_LEN] = record[:SEQ].length
end
end
output.puts record
end
end
-
-
+
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
+
+
__END__