# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-# Find and remove MID tags in sequences in the stream.
+# Find MID tags in sequences in the stream.
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-require 'biopieces'
+require 'maasha/biopieces'
+require 'maasha/bits'
require 'pp'
-MID_LEN = 10
-
-mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG
- ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG
- TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA
- TCACGTACTA CGTCTAGTAC TCTACGTAGC TGTACTACTC ACGACTACAG
- CGTAGACTAG TACGAGTATG TACTCTCGTG TAGAGACGAG TCGTCGCTCG
- ACATACGCGT ACGCGAGTAT ACTACTATGT ACTGTACAGT AGACTATACT
- AGCGTCGTCT AGTACGCTAT ATAGAGTACT CACGCTACGT CAGTAGACGT
- CGACGTGACT TACACACACT TACACGTGAT TACAGATCGT TACGCTGTCT
- TAGTGTAGAT TCGATCACGT TCGCACTAGT TCTAGCGACT TCTATACTAT
- TGACGTATGT TGTGAGTAGT ACAGTATATA ACGCGATCGA ACTAGCAGTA
- AGCTCACGTA AGTATACATA AGTCGAGAGA AGTGCTACGA CGATCGTATA
- CGCAGTACGA CGCGTATACA CGTACAGTCA CGTACTCAGA CTACGCTCTA
- CTATAGCGTA TACGTCATCA TAGTCGCATA TATATATACA TATGCTAGTA
- TCACGCGAGA TCGATAGTGA TCGCTGCGTA TCTGACGTCA TGAGTCAGTA
- TGTAGTGTGA TGTCACACGA TGTCGTCGCA ACACATACGC ACAGTCGTGC
- ACATGACGAC ACGACAGCTC ACGTCTCATC ACTCATCTAC ACTCGCGCAC
- AGAGCGTCAC AGCGACTAGC AGTAGTGATC AGTGACACAC AGTGTATGTC
- ATAGATAGAC ATATAGTCGC ATCTACTGAC CACGTAGATC CACGTGTCGC
- CATACTCTAC CGACACTATC CGAGACGCGC CGTATGCGAC CGTCGATCTC
- CTACGACTGC CTAGTCACTC CTCTACGCTC CTGTACATAC TAGACTGCAC
- TAGCGCGCGC TAGCTCTATC TATAGACATC TATGATACGC TCACTCATAC
- TCATCGAGTC TCGAGCTCTC TCGCAGACAC TCTGTCTCGC TGAGTGACGC
- TGATGTGTAC TGCTATAGAC TGCTCGCTAC ACGTGCAGCG ACTCACAGAG
- AGACTCAGCG AGAGAGTGTG AGCTATCGCG AGTCTGACTG AGTGAGCTCG
- ATAGCTCTCG ATCACGTGCG ATCGTAGCAG ATCGTCTGTG ATGTACGATG
- ATGTGTCTAG CACACGATAG CACTCGCACG CAGACGTCTG CAGTACTGCG
- CGACAGCGAG CGATCTGTCG CGCGTGCTAG CGCTCGAGTG CGTGATGACG
- CTATGTACAG CTCGATATAG CTCGCACGCG CTGCGTCACG CTGTGCGTCG
- TAGCATACTG TATACATGTG TATCACTCAG TATCTGATAG TCGTGACATG
- TCTGATCGAG TGACATCTCG TGAGCTAGAG TGATAGAGCG TGCGTGTGCG
- TGCTAGTCAG TGTATCACAG TGTGCGCGTG ACACGACGAC ACACGTAGTA
- ACACTACTCG ACGACACGTA ACGAGTAGAC ACGCGTCTAG ACGTACACAC
- ACGTACTGTG ACGTAGATCG ACTACGTCTC ACTATACGAG ACTCGCGTCG
+GS_SIZE = 10
+RL_SIZE = 11
+
+GSMID_HASH = {
+ ACGAGTGCGT: "MID1",
+ TCTCTATGCG: "MID10",
+ TAGACTGCAC: "MID100",
+ TAGCGCGCGC: "MID101",
+ TAGCTCTATC: "MID102",
+ TATAGACATC: "MID103",
+ TATGATACGC: "MID104",
+ TCACTCATAC: "MID105",
+ TCATCGAGTC: "MID106",
+ TCGAGCTCTC: "MID107",
+ TCGCAGACAC: "MID108",
+ TCTGTCTCGC: "MID109",
+ TGATACGTCT: "MID11",
+ TGAGTGACGC: "MID110",
+ TGATGTGTAC: "MID111",
+ TGCTATAGAC: "MID112",
+ TGCTCGCTAC: "MID113",
+ ACGTGCAGCG: "MID114",
+ ACTCACAGAG: "MID115",
+ AGACTCAGCG: "MID116",
+ AGAGAGTGTG: "MID117",
+ AGCTATCGCG: "MID118",
+ AGTCTGACTG: "MID119",
+ TACTGAGCTA: "MID12",
+ AGTGAGCTCG: "MID120",
+ ATAGCTCTCG: "MID121",
+ ATCACGTGCG: "MID122",
+ ATCGTAGCAG: "MID123",
+ ATCGTCTGTG: "MID124",
+ ATGTACGATG: "MID125",
+ ATGTGTCTAG: "MID126",
+ CACACGATAG: "MID127",
+ CACTCGCACG: "MID128",
+ CAGACGTCTG: "MID129",
+ CATAGTAGTG: "MID13",
+ CAGTACTGCG: "MID130",
+ CGACAGCGAG: "MID131",
+ CGATCTGTCG: "MID132",
+ CGCGTGCTAG: "MID133",
+ CGCTCGAGTG: "MID134",
+ CGTGATGACG: "MID135",
+ CTATGTACAG: "MID136",
+ CTCGATATAG: "MID137",
+ CTCGCACGCG: "MID138",
+ CTGCGTCACG: "MID139",
+ CGAGAGATAC: "MID14",
+ CTGTGCGTCG: "MID140",
+ TAGCATACTG: "MID141",
+ TATACATGTG: "MID142",
+ TATCACTCAG: "MID143",
+ TATCTGATAG: "MID144",
+ TCGTGACATG: "MID145",
+ TCTGATCGAG: "MID146",
+ TGACATCTCG: "MID147",
+ TGAGCTAGAG: "MID148",
+ TGATAGAGCG: "MID149",
+ ATACGACGTA: "MID15",
+ TGCGTGTGCG: "MID150",
+ TGCTAGTCAG: "MID151",
+ TGTATCACAG: "MID152",
+ TGTGCGCGTG: "MID153",
+ TCACGTACTA: "MID16",
+ CGTCTAGTAC: "MID17",
+ TCTACGTAGC: "MID18",
+ TGTACTACTC: "MID19",
+ ACGCTCGACA: "MID2",
+ ACGACTACAG: "MID20",
+ CGTAGACTAG: "MID21",
+ TACGAGTATG: "MID22",
+ TACTCTCGTG: "MID23",
+ TAGAGACGAG: "MID24",
+ TCGTCGCTCG: "MID25",
+ ACATACGCGT: "MID26",
+ ACGCGAGTAT: "MID27",
+ ACTACTATGT: "MID28",
+ ACTGTACAGT: "MID29",
+ AGACGCACTC: "MID3",
+ AGACTATACT: "MID30",
+ AGCGTCGTCT: "MID31",
+ AGTACGCTAT: "MID32",
+ ATAGAGTACT: "MID33",
+ CACGCTACGT: "MID34",
+ CAGTAGACGT: "MID35",
+ CGACGTGACT: "MID36",
+ TACACACACT: "MID37",
+ TACACGTGAT: "MID38",
+ TACAGATCGT: "MID39",
+ AGCACTGTAG: "MID4",
+ TACGCTGTCT: "MID40",
+ TAGTGTAGAT: "MID41",
+ TCGATCACGT: "MID42",
+ TCGCACTAGT: "MID43",
+ TCTAGCGACT: "MID44",
+ TCTATACTAT: "MID45",
+ TGACGTATGT: "MID46",
+ TGTGAGTAGT: "MID47",
+ ACAGTATATA: "MID48",
+ ACGCGATCGA: "MID49",
+ ATCAGACACG: "MID5",
+ ACTAGCAGTA: "MID50",
+ AGCTCACGTA: "MID51",
+ AGTATACATA: "MID52",
+ AGTCGAGAGA: "MID53",
+ AGTGCTACGA: "MID54",
+ CGATCGTATA: "MID55",
+ CGCAGTACGA: "MID56",
+ CGCGTATACA: "MID57",
+ CGTACAGTCA: "MID58",
+ CGTACTCAGA: "MID59",
+ ATATCGCGAG: "MID6",
+ CTACGCTCTA: "MID60",
+ CTATAGCGTA: "MID61",
+ TACGTCATCA: "MID62",
+ TAGTCGCATA: "MID63",
+ TATATATACA: "MID64",
+ TATGCTAGTA: "MID65",
+ TCACGCGAGA: "MID66",
+ TCGATAGTGA: "MID67",
+ TCGCTGCGTA: "MID68",
+ TCTGACGTCA: "MID69",
+ CGTGTCTCTA: "MID7",
+ TGAGTCAGTA: "MID70",
+ TGTAGTGTGA: "MID71",
+ TGTCACACGA: "MID72",
+ TGTCGTCGCA: "MID73",
+ ACACATACGC: "MID74",
+ ACAGTCGTGC: "MID75",
+ ACATGACGAC: "MID76",
+ ACGACAGCTC: "MID77",
+ ACGTCTCATC: "MID78",
+ ACTCATCTAC: "MID79",
+ CTCGCGTGTC: "MID8",
+ ACTCGCGCAC: "MID80",
+ AGAGCGTCAC: "MID81",
+ AGCGACTAGC: "MID82",
+ AGTAGTGATC: "MID83",
+ AGTGACACAC: "MID84",
+ AGTGTATGTC: "MID85",
+ ATAGATAGAC: "MID86",
+ ATATAGTCGC: "MID87",
+ ATCTACTGAC: "MID88",
+ CACGTAGATC: "MID89",
+ TAGTATCAGC: "MID9",
+ CACGTGTCGC: "MID90",
+ CATACTCTAC: "MID91",
+ CGACACTATC: "MID92",
+ CGAGACGCGC: "MID93",
+ CGTATGCGAC: "MID94",
+ CGTCGATCTC: "MID95",
+ CTACGACTGC: "MID96",
+ CTAGTCACTC: "MID97",
+ CTCTACGCTC: "MID98",
+ CTGTACATAC: "MID99",
}
+RLMID_HASH = {
+ ACACGACGACT: "RL1",
+ ACACGTAGTAT: "RL2",
+ ACACTACTCGT: "RL3",
+ ACGACACGTAT: "RL4",
+ ACGAGTAGACT: "RL5",
+ ACGCGTCTAGT: "RL6",
+ ACGTACACACT: "RL7",
+ ACGTACTGTGT: "RL8",
+ ACGTAGATCGT: "RL9",
+ ACTACGTCTCT: "RL10",
+ ACTATACGAGT: "RL11",
+ ACTCGCGTCGT: "RL12"
+}
-mid_hash = {}
+class MIDfinder
+ def initialize(mid_hash, pos, size, max_mismatches)
+ @mid_hash = mid_hash
+ @pos = pos
+ @size = size
+ @max_mismatches = max_mismatches
+ end
-mids.each_with_index do |mid, i|
- mid_hash[mid] = i
-end
+ def find_mid(seq)
+ hamming_dist = 0
+ tag = seq[@pos ... @pos + @size].upcase.to_sym
-casts = []
-casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
+ if @mid_hash.has_key? tag
+ return MID.new(tag, @mid_hash[tag], @pos, @size, hamming_dist)
+ elsif @max_mismatches > 0
+ @mid_hash.each_pair do |mid, mid_name|
+ hamming_dist = tag.to_s.hamming_distance(mid.to_s)
-bp = Biopieces.new
+ if hamming_dist <= @max_mismatches
+ return MID.new(mid, @mid_hash[mid], @pos, @size, hamming_dist)
+ end
+ end
+ end
-options = bp.parse(ARGV, casts)
+ nil
+ end
+
+ private
+
+ # Class to hold a MID object.
+ class MID
+ attr_accessor :tag, :name, :pos, :len, :mismatches
-pos = options[:pos]
+ def initialize(tag, name, pos, len, mismatches)
+ @tag = tag
+ @name = name
+ @pos = pos
+ @len = len
+ @mismatches = mismatches
+ end
-bp.each_record do |record|
- if record.has_key? :SEQ
- tag = record[:SEQ][pos ... pos + MID_LEN].upcase
+ def to_hash
+ hash = {}
+ hash[:MID] = @tag
+ hash[:MID_NAME] = @name
+ hash[:MID_POS] = @pos
+ hash[:MID_LEN] = @len
+ hash[:MID_MISMATCHES] = @mismatches
+ hash
+ end
- if mid_hash[tag]
- record[:SEQ] = record[:SEQ][pos + MID_LEN ... record[:SEQ].length]
- record[:SCORES] = record[:SCORES][pos + MID_LEN ... record[:SCORES].length] if record[:SCORES]
- record[:MID] = tag
- record[:MID_NUM] = mid_hash[tag] + 1
- record[:SEQ_LEN] = record[:SEQ].length
+ def start
+ @pos + @len
end
end
+end
+casts = []
+casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil}
+casts << {:long=>'gsmids', :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'rlmids', :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+raise "Can't remove only GSMIDs and only RLMIDs simultaniously." if options[:rlmids] and options[:gsmids]
+
+pos = options[:pos]
+mismatches = options[:mismatches]
+
+gsfinder = MIDfinder.new(GSMID_HASH, options[:pos], GS_SIZE, options[:mismatches])
+rlfinder = MIDfinder.new(RLMID_HASH, options[:pos], RL_SIZE, options[:mismatches])
+
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+ input.each_record do |record|
+ if record.has_key? :SEQ
- bp.puts record
+ if options[:gsmids]
+ mid = gsfinder.find_mid(record[:SEQ])
+ elsif options[:rlmids]
+ mid = rlfinder.find_mid(record[:SEQ])
+ else
+ mid = gsfinder.find_mid(record[:SEQ]) || rlfinder.find_mid(record[:SEQ])
+ end
+
+ if mid
+ record.merge!(mid.to_hash)
+
+ record[:SEQ] = record[:SEQ][mid.start .. -1]
+ record[:SCORES] = record[:SCORES][mid.start .. -1] if record[:SCORES]
+ record[:SEQ_LEN] = record[:SEQ].length
+ end
+ end
+
+ output.puts record
+ end
end
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<