require 'pp'
require 'maasha/biopieces'
-class TaxTable
+class TaxTree
def initialize
- @q_hash = Hash.new { |h, k| h[k] = [] }
+ @tree = TaxNode.new("root", "Root", 0, 0, 0.0)
end
# Method to add to the TaxTree a GreenGenes entry.
- def add_gg(q_id, s_id, score)
- hash = {}
+ def add_gg(s_id, score, size)
+ node = @tree
s_id.scan(/ ([\w])__([^;]+)/) do
- level = $1
+ level = expand_level($1)
name = $2
- hash[expand_level(level).to_sym] = name
- end
-
- @q_hash[q_id.to_sym] << hash
- end
-
- def lowest_common_ancestor
- @q_hash.each do |q_id, list|
- count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
-
- list.each do |elem|
- elem.each do |level, name|
- count_hash[level][name] += 1
- end
+ if node.children[name].nil?
+ node.children[name] = TaxNode.new(level, name, 1, size, score)
+ else
+ node.children[name].count += 1
+ node.children[name].score += score
end
- list.each do |elem|
- elem.each do |level, name|
- elem.delete(level) if count_hash[level].size > 1
- end
- end
+ node = node.children[name]
end
end
- def uniq
- lookup_hash = Hash.new
- new_hash = Hash.new { |h, k| h[k] = [] }
+ def merge(tree)
+ node = @tree
- @q_hash.each do |q_id, list|
- list.each do |elem|
- unless lookup_hash[elem.to_s]
- new_hash[q_id] << elem
- lookup_hash[elem.to_s] = true
- end
+ tree.flatten.each do |new_node|
+ next if new_node.level == 'root'
+
+ new_node.score = new_node.score / new_node.count
+ new_node.count = 1
+
+ if node.children[new_node.name].nil?
+ node.children[new_node.name] = new_node
+ else
+ node.children[new_node.name].count += new_node.count
+ node.children[new_node.name].score += new_node.score
+ node.children[new_node.name].size += new_node.size
end
- end
- @q_hash = new_hash
+ node = node.children[new_node.name]
+ end
end
- def count
- count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
- new_hash = Hash.new { |h, k| h[k] = [] }
+ def flatten(node = @tree, list = [])
+ list << TaxNode.new(node.level, node.name, node.count, node.size, node.score)
- @q_hash.each do |q_id, list|
- list.each do |elem|
- elem.each do |level, name|
- count_hash[level][name] += 1
- end
- end
+ node.children.each do |name, child|
+ list = flatten(child, list.dup)
end
- @q_hash.each do |q_id, list|
- new_list = []
-
- list.each do |elem|
- new_elem = {}
-
- elem.each do |level, name|
- new_elem[level] = name
- new_elem["#{level}_count".to_sym] = count_hash[level][name]
- end
+ list
+ end
- new_list << new_elem
- end
+ def lowest_common_ancestor(node = @tree)
+ node.children = {} if node.children.size > 1
- new_hash[q_id] = new_list
+ node.children.each do |name, child|
+ lowest_common_ancestor(child)
end
-
- @q_hash = new_hash
end
def each
- @q_hash.each do |q_id, list|
- list.each do |elem|
- yield elem
- end
+ self.flatten.each do |node|
+ yield node
end
+
+ self
end
private
def expand_level(level)
- case level
- when 'd' then level = "domain"
- when 'k' then level = "kingdom"
- when 'p' then level = "phylum"
- when 'c' then level = "class"
- when 'o' then level = "order"
- when 'f' then level = "family"
- when 'g' then level = "genus"
- when 's' then level = "species"
- else
- raise "unknown level: #{level}"
+ hash = {
+ 'd' => "domain",
+ 'k' => "kingdom",
+ 'p' => "phylum",
+ 'c' => "class",
+ 'o' => "order",
+ 'f' => "family",
+ 'g' => "genus",
+ 's' => "species"
+ }
+
+ raise "unknown level: #{level}" unless hash[level]
+
+ hash[level]
+ end
+
+ class TaxNode
+ attr_accessor :level, :name, :count, :size, :score, :children
+
+ def initialize(level, name, count, size, score)
+ @level = level
+ @name = name
+ @count = count
+ @size = size
+ @score = score
+ @children = {}
end
- level
+ def to_bp
+ record = {}
+ record[:REC_TYPE] = "Classification"
+ record[:LEVEL] = @level
+ record[:NAME] = @name
+ record[:COUNT] = @size
+ record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2)
+
+ record
+ end
end
end
casts = []
casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'uniq', :short=>'u', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'size', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
options = Biopieces.options_parse(ARGV, casts)
-taxtab = TaxTable.new
+tax_tree = TaxTree.new
+tax_hash = {}
Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
input.each_record do |record|
if record[:Q_ID] and record[:S_ID] and record[:SCORE]
- taxtab.add_gg(record[:Q_ID], record[:S_ID], record[:SCORE])
+ size = 1
+
+ if options[:size]
+ if record[:Q_ID].match(/_(\d+)$/)
+ size = $1.to_i
+ else
+ raise BiopiecesError, "Could not extract size from Q_ID: #{record[:Q_ID]}"
+ end
+ end
+
+ tax_hash[record[:Q_ID]] = TaxTree.new unless tax_hash[record[:Q_ID]]
+ tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ else
+ output.puts record
end
end
- taxtab.lowest_common_ancestor if options[:LCA]
- taxtab.uniq if options[:uniq]
- taxtab.count
-
- taxtab.each do |tax|
- record = {}
+ if options[:LCA]
+ tax_hash.each do |q_id, tree|
+ tree.lowest_common_ancestor
+ end
+ end
- tax.each { |k, v| record[k.upcase] = v }
+ tax_hash.each do |q_id, tree|
+ tax_tree.merge(tree)
+ end
- output.puts record
+ tax_tree.each do |node|
+ output.puts node.to_bp unless node.level == 'root'
end
end
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<