require 'pp'
require 'maasha/biopieces'
-# Class containing methods to construct a taxonomic tree for the classification of
-# organisms. Currently only works with GreenGenes type of entries.
-class TaxTree
- # Method to initialize a new TaxTree object.
- def initialize
- @tree = TaxNode.new("root", "Root", 0, 0, 0.0)
+# Class containing methods to construct a taxonomic tree recursively
+# for the classification of organisms. Currently only works with GreenGenes type of entries
+class TaxNode
+ attr_accessor :level, :name, :count, :score, :children
+
+ # Method to initalize a TaxNode object.
+ def initialize(level, name, count, score)
+ @level = level # Taxonomic level e.g. phylum, class, etc
+ @name = name # Name of organism
+ @count = count # Number of times this organism was encountered
+ @score = score # Similarity score
+ @children = {}
end
- # Method to add to the TaxTree a GreenGenes entry.
+ # Method to add to the taxonomic tree a GreenGenes entry.
def add_gg(s_id, score, size)
- node = @tree
+ node = self
s_id.scan(/ ([\w])__([^;]+)/) do
level = expand_level($1)
name = $2
if node.children[name].nil?
- node.children[name] = TaxNode.new(level, name, 1, size, score)
+ node.children[name] = TaxNode.new(level, name, size * 1, size * score)
else
- node.children[name].count += 1
- node.children[name].score += score
+ node.children[name].count += size * 1
+ node.children[name].score += size * score
end
node = node.children[name]
end
end
- # Method to merge two TaxTrees.
- def merge(tree)
- node = @tree
-
- tree.flatten.each do |new_node|
- next if new_node.level == 'root'
-
- new_node.score = new_node.score / new_node.count
- new_node.count = 1
-
- if node.children[new_node.name].nil?
- node.children[new_node.name] = new_node
+ # Method to merge two TaxNodes.
+ def merge(node_new, node_old = self)
+ node_new.children.each do |name, child|
+ if node_old.children[name]
+ node_old.count += node_new.count
+ node_old.score += node_new.score
else
- node.children[new_node.name].count += new_node.count
- node.children[new_node.name].score += new_node.score
- node.children[new_node.name].size += new_node.size
+ node_old.children[name] = child
end
- node = node.children[new_node.name]
+ merge(child, node_old.children[name])
end
end
- # Method to flatten a TaxTree turning this into a list by recursive depth first traversal.
- def flatten(node = @tree, list = [])
- list << TaxNode.new(node.level, node.name, node.count, node.size, node.score)
+ # Method to flatten a taxonomic tree turning this into a list by recursive depth first traversal.
+ def flatten(node = self, list = [])
+ list << TaxNode.new(node.level, node.name, node.count, node.score)
- node.children.each do |name, child|
- list = flatten(child, list.dup)
+ node.children.each_value do |child|
+ flatten(child, list)
end
list
end
- # Method to recursively trim a TaxTree show that it only contains a unbranched tree,
+ # Method to recursively trim a taxonomic tree so that it only contains an unbranched tree,
# which gives the lowest common ancestor.
- def lowest_common_ancestor(node = @tree)
+ def lowest_common_ancestor(node = self)
node.children = {} if node.children.size > 1
node.children.each do |name, child|
end
end
- # Method for iterating over a TaxTree.
+ # Method for iterating over a taxonomic tree.
def each
self.flatten.each do |node|
yield node
self
end
+ # Method to convert a TaxNode to a Biopiece record.
+ def to_bp
+ record = {}
+ record[:REC_TYPE] = "Classification"
+ record[:LEVEL] = @level
+ record[:NAME] = @name
+ record[:COUNT] = @count
+ record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2)
+
+ record
+ end
+
private
# Method containing a helper hash to expand the phylogenetic level name.
hash[level]
end
-
- # Nested class for TaxTree nodes.
- class TaxNode
- attr_accessor :level, :name, :count, :size, :score, :children
-
- def initialize(level, name, count, size, score)
- @level = level
- @name = name
- @count = count
- @size = size
- @score = score
- @children = {}
- end
-
- # Method to convert a TaxNode to a Biopiece record.
- def to_bp
- record = {}
- record[:REC_TYPE] = "Classification"
- record[:LEVEL] = @level
- record[:NAME] = @name
- record[:COUNT] = @size
- record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2)
-
- record
- end
- end
end
casts = []
options = Biopieces.options_parse(ARGV, casts)
-tax_tree = TaxTree.new
+tax_tree = TaxNode.new("root", "Root", 0, 0.0)
tax_hash = {}
Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
end
end
- tax_hash[record[:Q_ID]] = TaxTree.new unless tax_hash[record[:Q_ID]]
- tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ if options[:LCA]
+ tax_hash[record[:Q_ID]] = TaxNode.new("root", "Root", 0, 0.0) unless tax_hash[record[:Q_ID]]
+ tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ else
+ tax_tree.add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ end
else
output.puts record
end
end
if options[:LCA]
- tax_hash.each do |q_id, tree|
+ tax_hash.each_value do |tree|
tree.lowest_common_ancestor
end
- end
- tax_hash.each do |q_id, tree|
- tax_tree.merge(tree)
+ tax_hash.each_value do |tree|
+ tax_tree.merge(tree)
+ end
end
tax_tree.each do |node|