X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fremove_mids;h=c73d23ceec86f5fbaf51555e94402ab91c4a49a3;hb=017a7e74ba0757d00197643926536cd36ff88357;hp=c42a6c0d6cdda5f5f448450065d50faad028de2a;hpb=8dd2d998276e75b3ccda604a1b4c04eaf9bdcab3;p=biopieces.git diff --git a/bp_bin/remove_mids b/bp_bin/remove_mids index c42a6c0..c73d23c 100755 --- a/bp_bin/remove_mids +++ b/bp_bin/remove_mids @@ -24,83 +24,286 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -# Find and remove MID tags in sequences in the stream. +# Find MID tags in sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -require 'biopieces' +require 'maasha/biopieces' +require 'maasha/bits' require 'pp' -MID_LEN = 10 - -mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG - ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG - TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA - TCACGTACTA CGTCTAGTAC TCTACGTAGC TGTACTACTC ACGACTACAG - CGTAGACTAG TACGAGTATG TACTCTCGTG TAGAGACGAG TCGTCGCTCG - ACATACGCGT ACGCGAGTAT ACTACTATGT ACTGTACAGT AGACTATACT - AGCGTCGTCT AGTACGCTAT ATAGAGTACT CACGCTACGT CAGTAGACGT - CGACGTGACT TACACACACT TACACGTGAT TACAGATCGT TACGCTGTCT - TAGTGTAGAT TCGATCACGT TCGCACTAGT TCTAGCGACT TCTATACTAT - TGACGTATGT TGTGAGTAGT ACAGTATATA ACGCGATCGA ACTAGCAGTA - AGCTCACGTA AGTATACATA AGTCGAGAGA AGTGCTACGA CGATCGTATA - CGCAGTACGA CGCGTATACA CGTACAGTCA CGTACTCAGA CTACGCTCTA - CTATAGCGTA TACGTCATCA TAGTCGCATA TATATATACA TATGCTAGTA - TCACGCGAGA TCGATAGTGA TCGCTGCGTA TCTGACGTCA TGAGTCAGTA - TGTAGTGTGA TGTCACACGA TGTCGTCGCA ACACATACGC ACAGTCGTGC - ACATGACGAC ACGACAGCTC ACGTCTCATC ACTCATCTAC ACTCGCGCAC - AGAGCGTCAC AGCGACTAGC AGTAGTGATC AGTGACACAC AGTGTATGTC - ATAGATAGAC ATATAGTCGC ATCTACTGAC CACGTAGATC CACGTGTCGC - CATACTCTAC CGACACTATC CGAGACGCGC CGTATGCGAC CGTCGATCTC - CTACGACTGC CTAGTCACTC CTCTACGCTC CTGTACATAC TAGACTGCAC - TAGCGCGCGC TAGCTCTATC TATAGACATC TATGATACGC TCACTCATAC - TCATCGAGTC TCGAGCTCTC TCGCAGACAC TCTGTCTCGC TGAGTGACGC - TGATGTGTAC TGCTATAGAC TGCTCGCTAC ACGTGCAGCG ACTCACAGAG - AGACTCAGCG AGAGAGTGTG AGCTATCGCG AGTCTGACTG AGTGAGCTCG - ATAGCTCTCG ATCACGTGCG ATCGTAGCAG ATCGTCTGTG ATGTACGATG - ATGTGTCTAG CACACGATAG CACTCGCACG CAGACGTCTG CAGTACTGCG - CGACAGCGAG CGATCTGTCG CGCGTGCTAG CGCTCGAGTG CGTGATGACG - CTATGTACAG CTCGATATAG CTCGCACGCG CTGCGTCACG CTGTGCGTCG - TAGCATACTG TATACATGTG TATCACTCAG TATCTGATAG TCGTGACATG - TCTGATCGAG TGACATCTCG TGAGCTAGAG TGATAGAGCG TGCGTGTGCG - TGCTAGTCAG TGTATCACAG TGTGCGCGTG ACACGACGAC ACACGTAGTA - ACACTACTCG ACGACACGTA ACGAGTAGAC ACGCGTCTAG ACGTACACAC - ACGTACTGTG ACGTAGATCG ACTACGTCTC ACTATACGAG ACTCGCGTCG +GS_SIZE = 10 +RL_SIZE = 11 + +GSMID_HASH = { + ACGAGTGCGT: "MID1", + TCTCTATGCG: "MID10", + TAGACTGCAC: "MID100", + TAGCGCGCGC: "MID101", + TAGCTCTATC: "MID102", + TATAGACATC: "MID103", + TATGATACGC: "MID104", + TCACTCATAC: "MID105", + TCATCGAGTC: "MID106", + TCGAGCTCTC: "MID107", + TCGCAGACAC: "MID108", + TCTGTCTCGC: "MID109", + TGATACGTCT: "MID11", + TGAGTGACGC: "MID110", + TGATGTGTAC: "MID111", + TGCTATAGAC: "MID112", + TGCTCGCTAC: "MID113", + ACGTGCAGCG: "MID114", + ACTCACAGAG: "MID115", + AGACTCAGCG: "MID116", + AGAGAGTGTG: "MID117", + AGCTATCGCG: "MID118", + AGTCTGACTG: "MID119", + TACTGAGCTA: "MID12", + AGTGAGCTCG: "MID120", + ATAGCTCTCG: "MID121", + ATCACGTGCG: "MID122", + ATCGTAGCAG: "MID123", + ATCGTCTGTG: "MID124", + ATGTACGATG: "MID125", + ATGTGTCTAG: "MID126", + CACACGATAG: "MID127", + CACTCGCACG: "MID128", + CAGACGTCTG: "MID129", + CATAGTAGTG: "MID13", + CAGTACTGCG: "MID130", + CGACAGCGAG: "MID131", + CGATCTGTCG: "MID132", + CGCGTGCTAG: "MID133", + CGCTCGAGTG: "MID134", + CGTGATGACG: "MID135", + CTATGTACAG: "MID136", + CTCGATATAG: "MID137", + CTCGCACGCG: "MID138", + CTGCGTCACG: "MID139", + CGAGAGATAC: "MID14", + CTGTGCGTCG: "MID140", + TAGCATACTG: "MID141", + TATACATGTG: "MID142", + TATCACTCAG: "MID143", + TATCTGATAG: "MID144", + TCGTGACATG: "MID145", + TCTGATCGAG: "MID146", + TGACATCTCG: "MID147", + TGAGCTAGAG: "MID148", + TGATAGAGCG: "MID149", + ATACGACGTA: "MID15", + TGCGTGTGCG: "MID150", + TGCTAGTCAG: "MID151", + TGTATCACAG: "MID152", + TGTGCGCGTG: "MID153", + TCACGTACTA: "MID16", + CGTCTAGTAC: "MID17", + TCTACGTAGC: "MID18", + TGTACTACTC: "MID19", + ACGCTCGACA: "MID2", + ACGACTACAG: "MID20", + CGTAGACTAG: "MID21", + TACGAGTATG: "MID22", + TACTCTCGTG: "MID23", + TAGAGACGAG: "MID24", + TCGTCGCTCG: "MID25", + ACATACGCGT: "MID26", + ACGCGAGTAT: "MID27", + ACTACTATGT: "MID28", + ACTGTACAGT: "MID29", + AGACGCACTC: "MID3", + AGACTATACT: "MID30", + AGCGTCGTCT: "MID31", + AGTACGCTAT: "MID32", + ATAGAGTACT: "MID33", + CACGCTACGT: "MID34", + CAGTAGACGT: "MID35", + CGACGTGACT: "MID36", + TACACACACT: "MID37", + TACACGTGAT: "MID38", + TACAGATCGT: "MID39", + AGCACTGTAG: "MID4", + TACGCTGTCT: "MID40", + TAGTGTAGAT: "MID41", + TCGATCACGT: "MID42", + TCGCACTAGT: "MID43", + TCTAGCGACT: "MID44", + TCTATACTAT: "MID45", + TGACGTATGT: "MID46", + TGTGAGTAGT: "MID47", + ACAGTATATA: "MID48", + ACGCGATCGA: "MID49", + ATCAGACACG: "MID5", + ACTAGCAGTA: "MID50", + AGCTCACGTA: "MID51", + AGTATACATA: "MID52", + AGTCGAGAGA: "MID53", + AGTGCTACGA: "MID54", + CGATCGTATA: "MID55", + CGCAGTACGA: "MID56", + CGCGTATACA: "MID57", + CGTACAGTCA: "MID58", + CGTACTCAGA: "MID59", + ATATCGCGAG: "MID6", + CTACGCTCTA: "MID60", + CTATAGCGTA: "MID61", + TACGTCATCA: "MID62", + TAGTCGCATA: "MID63", + TATATATACA: "MID64", + TATGCTAGTA: "MID65", + TCACGCGAGA: "MID66", + TCGATAGTGA: "MID67", + TCGCTGCGTA: "MID68", + TCTGACGTCA: "MID69", + CGTGTCTCTA: "MID7", + TGAGTCAGTA: "MID70", + TGTAGTGTGA: "MID71", + TGTCACACGA: "MID72", + TGTCGTCGCA: "MID73", + ACACATACGC: "MID74", + ACAGTCGTGC: "MID75", + ACATGACGAC: "MID76", + ACGACAGCTC: "MID77", + ACGTCTCATC: "MID78", + ACTCATCTAC: "MID79", + CTCGCGTGTC: "MID8", + ACTCGCGCAC: "MID80", + AGAGCGTCAC: "MID81", + AGCGACTAGC: "MID82", + AGTAGTGATC: "MID83", + AGTGACACAC: "MID84", + AGTGTATGTC: "MID85", + ATAGATAGAC: "MID86", + ATATAGTCGC: "MID87", + ATCTACTGAC: "MID88", + CACGTAGATC: "MID89", + TAGTATCAGC: "MID9", + CACGTGTCGC: "MID90", + CATACTCTAC: "MID91", + CGACACTATC: "MID92", + CGAGACGCGC: "MID93", + CGTATGCGAC: "MID94", + CGTCGATCTC: "MID95", + CTACGACTGC: "MID96", + CTAGTCACTC: "MID97", + CTCTACGCTC: "MID98", + CTGTACATAC: "MID99", } +RLMID_HASH = { + ACACGACGACT: "RL1", + ACACGTAGTAT: "RL2", + ACACTACTCGT: "RL3", + ACGACACGTAT: "RL4", + ACGAGTAGACT: "RL5", + ACGCGTCTAGT: "RL6", + ACGTACACACT: "RL7", + ACGTACTGTGT: "RL8", + ACGTAGATCGT: "RL9", + ACTACGTCTCT: "RL10", + ACTATACGAGT: "RL11", + ACTCGCGTCGT: "RL12" +} -mid_hash = {} +class MIDfinder + def initialize(mid_hash, pos, size, max_mismatches) + @mid_hash = mid_hash + @pos = pos + @size = size + @max_mismatches = max_mismatches + end -mids.each_with_index do |mid, i| - mid_hash[mid] = i -end + def find_mid(seq) + hamming_dist = 0 + tag = seq[@pos ... @pos + @size].upcase.to_sym -casts = [] -casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} + if @mid_hash.has_key? tag + return MID.new(tag, @mid_hash[tag], @pos, @size, hamming_dist) + elsif @max_mismatches > 0 + @mid_hash.each_pair do |mid, mid_name| + hamming_dist = tag.to_s.hamming_distance(mid.to_s) -bp = Biopieces.new + if hamming_dist <= @max_mismatches + return MID.new(mid, @mid_hash[mid], @pos, @size, hamming_dist) + end + end + end -options = bp.parse(ARGV, casts) + nil + end + + private + + # Class to hold a MID object. + class MID + attr_accessor :tag, :name, :pos, :len, :mismatches -pos = options[:pos] + def initialize(tag, name, pos, len, mismatches) + @tag = tag + @name = name + @pos = pos + @len = len + @mismatches = mismatches + end -bp.each_record do |record| - if record.has_key? :SEQ - tag = record[:SEQ][pos ... pos + MID_LEN].upcase + def to_hash + hash = {} + hash[:MID] = @tag + hash[:MID_NAME] = @name + hash[:MID_POS] = @pos + hash[:MID_LEN] = @len + hash[:MID_MISMATCHES] = @mismatches + hash + end - if mid_hash[tag] - record[:SEQ] = record[:SEQ][pos + MID_LEN ... record[:SEQ].length] - record[:SCORES] = record[:SCORES][pos + MID_LEN ... record[:SCORES].length] if record[:SCORES] - record[:MID] = tag - record[:MID_NUM] = mid_hash[tag] + 1 - record[:SEQ_LEN] = record[:SEQ].length + def start + @pos + @len end end +end +casts = [] +casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil} +casts << {:long=>'gsmids', :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'rlmids', :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +raise "Can't remove only GSMIDs and only RLMIDs simultaniously." if options[:rlmids] and options[:gsmids] + +pos = options[:pos] +mismatches = options[:mismatches] + +gsfinder = MIDfinder.new(GSMID_HASH, options[:pos], GS_SIZE, options[:mismatches]) +rlfinder = MIDfinder.new(RLMID_HASH, options[:pos], RL_SIZE, options[:mismatches]) + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + if record.has_key? :SEQ - bp.puts record + if options[:gsmids] + mid = gsfinder.find_mid(record[:SEQ]) + elsif options[:rlmids] + mid = rlfinder.find_mid(record[:SEQ]) + else + mid = gsfinder.find_mid(record[:SEQ]) || rlfinder.find_mid(record[:SEQ]) + end + + if mid + record.merge!(mid.to_hash) + + record[:SEQ] = record[:SEQ][mid.start .. -1] + record[:SCORES] = record[:SCORES][mid.start .. -1] if record[:SCORES] + record[:SEQ_LEN] = record[:SEQ].length + end + end + + output.puts record + end end + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<