]> git.donarmstrong.com Git - biopieces.git/blobdiff - bp_bin/remove_mids
speed optimized interval_set and interval_unset in bitarray.rb - now evil fast
[biopieces.git] / bp_bin / remove_mids
index c42a6c0d6cdda5f5f448450065d50faad028de2a..c73d23ceec86f5fbaf51555e94402ab91c4a49a3 100755 (executable)
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-# Find and remove MID tags in sequences in the stream.
+# Find MID tags in sequences in the stream.
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
-
-require 'biopieces'
+require 'maasha/biopieces'
+require 'maasha/bits'
 require 'pp'
 
-MID_LEN = 10
-
-mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG
-           ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG
-           TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA
-           TCACGTACTA CGTCTAGTAC TCTACGTAGC TGTACTACTC ACGACTACAG
-           CGTAGACTAG TACGAGTATG TACTCTCGTG TAGAGACGAG TCGTCGCTCG
-           ACATACGCGT ACGCGAGTAT ACTACTATGT ACTGTACAGT AGACTATACT
-           AGCGTCGTCT AGTACGCTAT ATAGAGTACT CACGCTACGT CAGTAGACGT
-           CGACGTGACT TACACACACT TACACGTGAT TACAGATCGT TACGCTGTCT
-           TAGTGTAGAT TCGATCACGT TCGCACTAGT TCTAGCGACT TCTATACTAT
-           TGACGTATGT TGTGAGTAGT ACAGTATATA ACGCGATCGA ACTAGCAGTA
-           AGCTCACGTA AGTATACATA AGTCGAGAGA AGTGCTACGA CGATCGTATA
-           CGCAGTACGA CGCGTATACA CGTACAGTCA CGTACTCAGA CTACGCTCTA
-           CTATAGCGTA TACGTCATCA TAGTCGCATA TATATATACA TATGCTAGTA
-           TCACGCGAGA TCGATAGTGA TCGCTGCGTA TCTGACGTCA TGAGTCAGTA
-           TGTAGTGTGA TGTCACACGA TGTCGTCGCA ACACATACGC ACAGTCGTGC
-           ACATGACGAC ACGACAGCTC ACGTCTCATC ACTCATCTAC ACTCGCGCAC
-           AGAGCGTCAC AGCGACTAGC AGTAGTGATC AGTGACACAC AGTGTATGTC
-           ATAGATAGAC ATATAGTCGC ATCTACTGAC CACGTAGATC CACGTGTCGC
-           CATACTCTAC CGACACTATC CGAGACGCGC CGTATGCGAC CGTCGATCTC
-           CTACGACTGC CTAGTCACTC CTCTACGCTC CTGTACATAC TAGACTGCAC
-           TAGCGCGCGC TAGCTCTATC TATAGACATC TATGATACGC TCACTCATAC
-           TCATCGAGTC TCGAGCTCTC TCGCAGACAC TCTGTCTCGC TGAGTGACGC
-           TGATGTGTAC TGCTATAGAC TGCTCGCTAC ACGTGCAGCG ACTCACAGAG
-           AGACTCAGCG AGAGAGTGTG AGCTATCGCG AGTCTGACTG AGTGAGCTCG
-           ATAGCTCTCG ATCACGTGCG ATCGTAGCAG ATCGTCTGTG ATGTACGATG
-           ATGTGTCTAG CACACGATAG CACTCGCACG CAGACGTCTG CAGTACTGCG
-           CGACAGCGAG CGATCTGTCG CGCGTGCTAG CGCTCGAGTG CGTGATGACG
-           CTATGTACAG CTCGATATAG CTCGCACGCG CTGCGTCACG CTGTGCGTCG
-           TAGCATACTG TATACATGTG TATCACTCAG TATCTGATAG TCGTGACATG
-           TCTGATCGAG TGACATCTCG TGAGCTAGAG TGATAGAGCG TGCGTGTGCG
-           TGCTAGTCAG TGTATCACAG TGTGCGCGTG ACACGACGAC ACACGTAGTA
-           ACACTACTCG ACGACACGTA ACGAGTAGAC ACGCGTCTAG ACGTACACAC
-           ACGTACTGTG ACGTAGATCG ACTACGTCTC ACTATACGAG ACTCGCGTCG
+GS_SIZE = 10
+RL_SIZE = 11
+
+GSMID_HASH = {
+  ACGAGTGCGT: "MID1",
+  TCTCTATGCG: "MID10",
+  TAGACTGCAC: "MID100",
+  TAGCGCGCGC: "MID101",
+  TAGCTCTATC: "MID102",
+  TATAGACATC: "MID103",
+  TATGATACGC: "MID104",
+  TCACTCATAC: "MID105",
+  TCATCGAGTC: "MID106",
+  TCGAGCTCTC: "MID107",
+  TCGCAGACAC: "MID108",
+  TCTGTCTCGC: "MID109",
+  TGATACGTCT: "MID11",
+  TGAGTGACGC: "MID110",
+  TGATGTGTAC: "MID111",
+  TGCTATAGAC: "MID112",
+  TGCTCGCTAC: "MID113",
+  ACGTGCAGCG: "MID114",
+  ACTCACAGAG: "MID115",
+  AGACTCAGCG: "MID116",
+  AGAGAGTGTG: "MID117",
+  AGCTATCGCG: "MID118",
+  AGTCTGACTG: "MID119",
+  TACTGAGCTA: "MID12",
+  AGTGAGCTCG: "MID120",
+  ATAGCTCTCG: "MID121",
+  ATCACGTGCG: "MID122",
+  ATCGTAGCAG: "MID123",
+  ATCGTCTGTG: "MID124",
+  ATGTACGATG: "MID125",
+  ATGTGTCTAG: "MID126",
+  CACACGATAG: "MID127",
+  CACTCGCACG: "MID128",
+  CAGACGTCTG: "MID129",
+  CATAGTAGTG: "MID13",
+  CAGTACTGCG: "MID130",
+  CGACAGCGAG: "MID131",
+  CGATCTGTCG: "MID132",
+  CGCGTGCTAG: "MID133",
+  CGCTCGAGTG: "MID134",
+  CGTGATGACG: "MID135",
+  CTATGTACAG: "MID136",
+  CTCGATATAG: "MID137",
+  CTCGCACGCG: "MID138",
+  CTGCGTCACG: "MID139",
+  CGAGAGATAC: "MID14",
+  CTGTGCGTCG: "MID140",
+  TAGCATACTG: "MID141",
+  TATACATGTG: "MID142",
+  TATCACTCAG: "MID143",
+  TATCTGATAG: "MID144",
+  TCGTGACATG: "MID145",
+  TCTGATCGAG: "MID146",
+  TGACATCTCG: "MID147",
+  TGAGCTAGAG: "MID148",
+  TGATAGAGCG: "MID149",
+  ATACGACGTA: "MID15",
+  TGCGTGTGCG: "MID150",
+  TGCTAGTCAG: "MID151",
+  TGTATCACAG: "MID152",
+  TGTGCGCGTG: "MID153",
+  TCACGTACTA: "MID16",
+  CGTCTAGTAC: "MID17",
+  TCTACGTAGC: "MID18",
+  TGTACTACTC: "MID19",
+  ACGCTCGACA: "MID2",
+  ACGACTACAG: "MID20",
+  CGTAGACTAG: "MID21",
+  TACGAGTATG: "MID22",
+  TACTCTCGTG: "MID23",
+  TAGAGACGAG: "MID24",
+  TCGTCGCTCG: "MID25",
+  ACATACGCGT: "MID26",
+  ACGCGAGTAT: "MID27",
+  ACTACTATGT: "MID28",
+  ACTGTACAGT: "MID29",
+  AGACGCACTC: "MID3",
+  AGACTATACT: "MID30",
+  AGCGTCGTCT: "MID31",
+  AGTACGCTAT: "MID32",
+  ATAGAGTACT: "MID33",
+  CACGCTACGT: "MID34",
+  CAGTAGACGT: "MID35",
+  CGACGTGACT: "MID36",
+  TACACACACT: "MID37",
+  TACACGTGAT: "MID38",
+  TACAGATCGT: "MID39",
+  AGCACTGTAG: "MID4",
+  TACGCTGTCT: "MID40",
+  TAGTGTAGAT: "MID41",
+  TCGATCACGT: "MID42",
+  TCGCACTAGT: "MID43",
+  TCTAGCGACT: "MID44",
+  TCTATACTAT: "MID45",
+  TGACGTATGT: "MID46",
+  TGTGAGTAGT: "MID47",
+  ACAGTATATA: "MID48",
+  ACGCGATCGA: "MID49",
+  ATCAGACACG: "MID5",
+  ACTAGCAGTA: "MID50",
+  AGCTCACGTA: "MID51",
+  AGTATACATA: "MID52",
+  AGTCGAGAGA: "MID53",
+  AGTGCTACGA: "MID54",
+  CGATCGTATA: "MID55",
+  CGCAGTACGA: "MID56",
+  CGCGTATACA: "MID57",
+  CGTACAGTCA: "MID58",
+  CGTACTCAGA: "MID59",
+  ATATCGCGAG: "MID6",
+  CTACGCTCTA: "MID60",
+  CTATAGCGTA: "MID61",
+  TACGTCATCA: "MID62",
+  TAGTCGCATA: "MID63",
+  TATATATACA: "MID64",
+  TATGCTAGTA: "MID65",
+  TCACGCGAGA: "MID66",
+  TCGATAGTGA: "MID67",
+  TCGCTGCGTA: "MID68",
+  TCTGACGTCA: "MID69",
+  CGTGTCTCTA: "MID7",
+  TGAGTCAGTA: "MID70",
+  TGTAGTGTGA: "MID71",
+  TGTCACACGA: "MID72",
+  TGTCGTCGCA: "MID73",
+  ACACATACGC: "MID74",
+  ACAGTCGTGC: "MID75",
+  ACATGACGAC: "MID76",
+  ACGACAGCTC: "MID77",
+  ACGTCTCATC: "MID78",
+  ACTCATCTAC: "MID79",
+  CTCGCGTGTC: "MID8",
+  ACTCGCGCAC: "MID80",
+  AGAGCGTCAC: "MID81",
+  AGCGACTAGC: "MID82",
+  AGTAGTGATC: "MID83",
+  AGTGACACAC: "MID84",
+  AGTGTATGTC: "MID85",
+  ATAGATAGAC: "MID86",
+  ATATAGTCGC: "MID87",
+  ATCTACTGAC: "MID88",
+  CACGTAGATC: "MID89",
+  TAGTATCAGC: "MID9",
+  CACGTGTCGC: "MID90",
+  CATACTCTAC: "MID91",
+  CGACACTATC: "MID92",
+  CGAGACGCGC: "MID93",
+  CGTATGCGAC: "MID94",
+  CGTCGATCTC: "MID95",
+  CTACGACTGC: "MID96",
+  CTAGTCACTC: "MID97",
+  CTCTACGCTC: "MID98",
+  CTGTACATAC: "MID99",
 }
 
+RLMID_HASH = {
+  ACACGACGACT: "RL1",
+  ACACGTAGTAT: "RL2",
+  ACACTACTCGT: "RL3",
+  ACGACACGTAT: "RL4",
+  ACGAGTAGACT: "RL5",
+  ACGCGTCTAGT: "RL6",
+  ACGTACACACT: "RL7",
+  ACGTACTGTGT: "RL8",
+  ACGTAGATCGT: "RL9",
+  ACTACGTCTCT: "RL10",
+  ACTATACGAGT: "RL11",
+  ACTCGCGTCGT: "RL12" 
+}
 
-mid_hash = {}
+class MIDfinder
+  def initialize(mid_hash, pos, size, max_mismatches)
+    @mid_hash       = mid_hash
+    @pos            = pos
+    @size           = size
+    @max_mismatches = max_mismatches
+  end
 
-mids.each_with_index do |mid, i|
-  mid_hash[mid] = i
-end
+  def find_mid(seq)
+    hamming_dist = 0
+    tag          = seq[@pos ... @pos + @size].upcase.to_sym
 
-casts = []
-casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
+    if @mid_hash.has_key? tag
+      return MID.new(tag, @mid_hash[tag], @pos, @size, hamming_dist)
+    elsif @max_mismatches > 0
+      @mid_hash.each_pair do |mid, mid_name|
+        hamming_dist = tag.to_s.hamming_distance(mid.to_s)
 
-bp = Biopieces.new
+        if hamming_dist <= @max_mismatches
+          return MID.new(mid, @mid_hash[mid], @pos, @size, hamming_dist)
+        end
+      end
+    end
 
-options = bp.parse(ARGV, casts)
+    nil
+  end
+
+  private
+
+  # Class to hold a MID object.
+  class MID
+    attr_accessor :tag, :name, :pos, :len, :mismatches
 
-pos = options[:pos]
+    def initialize(tag, name, pos, len, mismatches)
+      @tag        = tag
+      @name       = name
+      @pos        = pos
+      @len        = len
+      @mismatches = mismatches
+    end
 
-bp.each_record do |record|
-  if record.has_key? :SEQ
-    tag = record[:SEQ][pos ... pos + MID_LEN].upcase
+    def to_hash
+      hash = {}
+      hash[:MID]            = @tag
+      hash[:MID_NAME]       = @name
+      hash[:MID_POS]        = @pos
+      hash[:MID_LEN]        = @len
+      hash[:MID_MISMATCHES] = @mismatches
+      hash
+    end
 
-    if mid_hash[tag]
-      record[:SEQ]     = record[:SEQ][pos + MID_LEN ... record[:SEQ].length]
-      record[:SCORES]  = record[:SCORES][pos + MID_LEN ... record[:SCORES].length] if record[:SCORES]
-      record[:MID]     = tag
-      record[:MID_NUM] = mid_hash[tag] + 1
-      record[:SEQ_LEN] = record[:SEQ].length
+    def start
+      @pos + @len
     end
   end
+end
+casts = []
+casts << {:long=>'pos',        :short=>'p', :type=>'uint', :mandatory=>false, :default=>0,   :allowed=>nil,     :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0,   :allowed=>"0,1,2", :disallowed=>nil}
+casts << {:long=>'gsmids',     :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil,     :disallowed=>nil}
+casts << {:long=>'rlmids',     :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil,     :disallowed=>nil}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+raise "Can't remove only GSMIDs and only RLMIDs simultaniously." if options[:rlmids] and options[:gsmids]
+
+pos         = options[:pos]
+mismatches  = options[:mismatches]
+
+gsfinder = MIDfinder.new(GSMID_HASH, options[:pos], GS_SIZE, options[:mismatches])
+rlfinder = MIDfinder.new(RLMID_HASH, options[:pos], RL_SIZE, options[:mismatches])
+
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+  input.each_record do |record|
+    if record.has_key? :SEQ
 
-  bp.puts record
+      if options[:gsmids]
+        mid = gsfinder.find_mid(record[:SEQ])
+      elsif options[:rlmids]
+        mid = rlfinder.find_mid(record[:SEQ])
+      else
+        mid = gsfinder.find_mid(record[:SEQ]) || rlfinder.find_mid(record[:SEQ])
+      end
+
+      if mid
+        record.merge!(mid.to_hash)
+
+        record[:SEQ]     = record[:SEQ][mid.start .. -1]
+        record[:SCORES]  = record[:SCORES][mid.start .. -1] if record[:SCORES]
+        record[:SEQ_LEN] = record[:SEQ].length
+      end
+    end
+
+    output.puts record
+  end
 end
 
+
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<