require 'maasha/bits'
require 'pp'
-gsmid_hash = {
+GS_SIZE = 10
+RL_SIZE = 11
+
+GSMID_HASH = {
ACGAGTGCGT: "MID1",
TCTCTATGCG: "MID10",
TAGACTGCAC: "MID100",
CTGTACATAC: "MID99",
}
-rlmid_hash = {
+RLMID_HASH = {
ACACGACGACT: "RL1",
ACACGTAGTAT: "RL2",
ACACTACTCGT: "RL3",
ACTCGCGTCGT: "RL12"
}
-casts = []
-casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil}
+class MIDfinder
+ def initialize(mid_hash, pos, size, max_mismatches)
+ @mid_hash = mid_hash
+ @pos = pos
+ @size = size
+ @max_mismatches = max_mismatches
+ end
-options = Biopieces.options_parse(ARGV, casts)
+ def find_mid(seq)
+ hamming_dist = 0
+ tag = seq[@pos ... @pos + @size].upcase.to_sym
-pos = options[:pos]
-mismatches = options[:mismatches]
+ if @mid_hash.has_key? tag
+ return MID.new(tag, @mid_hash[tag], @pos, @size, hamming_dist)
+ elsif @max_mismatches > 0
+ @mid_hash.each_pair do |mid, mid_name|
+ hamming_dist = tag.to_s.hamming_distance(mid.to_s)
-Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
- input.each_record do |record|
- if record.has_key? :SEQ
- found_tag = nil
- hamming_dist = 0
+ if hamming_dist <= @max_mismatches
+ return MID.new(mid, @mid_hash[mid], @pos, @size, hamming_dist)
+ end
+ end
+ end
- tag10 = record[:SEQ][pos ... pos + 10].upcase.to_sym
- tag11 = record[:SEQ][pos ... pos + 11].upcase.to_sym
+ nil
+ end
- if gsmid_hash.has_key? tag10
- found_tag = tag10
- elsif rlmid_hash.has_key? tag11
- found_tag = tag11
- end
+ private
- if mismatches > 0 and found_tag.nil?
- if found_tag.nil?
- gsmid_hash.each_pair do |mid, mid_name|
- hamming_dist = tag10.to_s.hamming_distance(mid.to_s)
+ # Class to hold a MID object.
+ class MID
+ attr_accessor :tag, :name, :pos, :len, :mismatches
- if hamming_dist <= mismatches
- found_tag = mid
+ def initialize(tag, name, pos, len, mismatches)
+ @tag = tag
+ @name = name
+ @pos = pos
+ @len = len
+ @mismatches = mismatches
+ end
- break
- end
- end
- end
+ def to_hash
+ hash = {}
+ hash[:MID] = @tag
+ hash[:MID_NAME] = @name
+ hash[:MID_POS] = @pos
+ hash[:MID_LEN] = @len
+ hash[:MID_MISMATCHES] = @mismatches
+ hash
+ end
+
+ def start
+ @pos + @len
+ end
+ end
+end
+casts = []
+casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil}
+casts << {:long=>'gsmids', :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'rlmids', :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
- if found_tag.nil?
- rlmid_hash.each_pair do |mid, mid_name|
- hamming_dist = tag11.to_s.hamming_distance(mid.to_s)
+options = Biopieces.options_parse(ARGV, casts)
- if hamming_dist <= mismatches
- found_tag = mid
+raise "Can't remove only GSMIDs and only RLMIDs simultaniously." if options[:rlmids] and options[:gsmids]
- break
- end
- end
- end
- end
+pos = options[:pos]
+mismatches = options[:mismatches]
- unless found_tag.nil?
- mid_name = (found_tag.length == 10) ? gsmid_hash[found_tag] : rlmid_hash[found_tag]
+gsfinder = MIDfinder.new(GSMID_HASH, options[:pos], GS_SIZE, options[:mismatches])
+rlfinder = MIDfinder.new(RLMID_HASH, options[:pos], RL_SIZE, options[:mismatches])
- record[:MID] = found_tag
- record[:MID_NAME] = mid_name
- record[:MID_POS] = pos
- record[:MID_LEN] = found_tag.length
- record[:MID_MISMATCHES] = hamming_dist
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+ input.each_record do |record|
+ if record.has_key? :SEQ
+
+ if options[:gsmids]
+ mid = gsfinder.find_mid(record[:SEQ])
+ elsif options[:rlmids]
+ mid = rlfinder.find_mid(record[:SEQ])
+ else
+ mid = gsfinder.find_mid(record[:SEQ]) || rlfinder.find_mid(record[:SEQ])
+ end
- start = pos + found_tag.length
+ if mid
+ record.merge!(mid.to_hash)
- record[:SEQ] = record[:SEQ][start .. -1]
- record[:SCORES] = record[:SCORES][start .. -1] if record[:SCORES]
+ record[:SEQ] = record[:SEQ][mid.start .. -1]
+ record[:SCORES] = record[:SCORES][mid.start .. -1] if record[:SCORES]
record[:SEQ_LEN] = record[:SEQ].length
end
end
output.puts record
end
end
-
-
+
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
+
+
__END__