require 'pp'
require 'maasha/biopieces'
-class TaxTree
- def initialize
- @tree = TaxNode.new("root", "Root", 0, 0, 0.0)
+# Class containing methods to construct a taxonomic tree recursively
+# for the classification of organisms. Currently only works with GreenGenes type of entries
+class TaxNode
+ attr_accessor :level, :name, :count, :score, :children
+
+ # Method to initalize a TaxNode object.
+ def initialize(level, name, count, score)
+ @level = level # Taxonomic level e.g. phylum, class, etc
+ @name = name # Name of organism
+ @count = count # Number of times this organism was encountered
+ @score = score # Similarity score
+ @children = {}
end
- # Method to add to the TaxTree a GreenGenes entry.
+ # Method to add to the taxonomic tree a GreenGenes entry.
def add_gg(s_id, score, size)
- node = @tree
+ node = self
- s_id.scan(/ ([\w])__([^;]+)/) do
+ s_id.scan(/([\w])__([^;]+)/) do
level = expand_level($1)
name = $2
if node.children[name].nil?
- node.children[name] = TaxNode.new(level, name, 1, size, score)
+ node.children[name] = TaxNode.new(level, name, size * 1, size * score)
else
- node.children[name].count += 1
- node.children[name].score += score
+ node.children[name].count += size * 1
+ node.children[name].score += size * score
end
node = node.children[name]
end
end
- def merge(tree)
- node = @tree
+ # Method to merge two TaxNodes.
+ def merge(node_new, node_old = self)
+ node_new.children.each do |name, child|
+ if node_old.children[name]
+ node_old.children[name].count += child.count
+ node_old.children[name].score += child.score
- tree.flatten.each do |new_node|
- next if new_node.level == 'root'
-
- new_node.score = new_node.score / new_node.count
- new_node.count = 1
-
- if node.children[new_node.name].nil?
- node.children[new_node.name] = new_node
+ merge(child, node_old.children[name])
else
- node.children[new_node.name].count += new_node.count
- node.children[new_node.name].score += new_node.score
- node.children[new_node.name].size += new_node.size
+ node_old.children[name] = child
end
+ end
+ end
- node = node.children[new_node.name]
+ # Method to flatten a taxonomic tree turning this into a list by recursive depth first traversal.
+ def flatten(node = self, list = [])
+ list << TaxNode.new(node.level, node.name, node.count, node.score)
+
+ node.children.each_value do |child|
+ flatten(child, list)
end
+
+ list
end
- def flatten(node = @tree, list = [])
- list << TaxNode.new(node.level, node.name, node.count, node.size, node.score)
+ # Method to recursively remove branches in taxonomic tree where the child count is less than or
+ # equal to a given minimum.
+ def debranch(min_count = nil)
+ node = self
node.children.each do |name, child|
- list = flatten(child, list.dup)
+ node.children.delete(name) if child.count <= min_count
end
- list
+ node.children.each_value do |child|
+ child.debranch(min_count)
+ end
end
- def lowest_common_ancestor(node = @tree)
+ # Method to recursively trim a taxonomic tree so that it only contains an unbranched tree,
+ # which gives the lowest common ancestor.
+ def lowest_common_ancestor
+ node = self
+
node.children = {} if node.children.size > 1
- node.children.each do |name, child|
- lowest_common_ancestor(child)
+ node.children.each_value do |child|
+ child.lowest_common_ancestor
end
end
+ # Method for iterating over a taxonomic tree.
def each
self.flatten.each do |node|
yield node
self
end
+ # Method to convert a TaxNode to a Biopiece record.
+ def to_bp
+ record = {}
+ record[:REC_TYPE] = "Classification"
+ record[:LEVEL] = @level
+ record[:NAME] = @name
+ record[:COUNT] = @count
+ record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2)
+
+ record
+ end
+
private
+ # Method containing a helper hash to expand the phylogenetic level name.
def expand_level(level)
hash = {
'd' => "domain",
hash[level]
end
-
- class TaxNode
- attr_accessor :level, :name, :count, :size, :score, :children
-
- def initialize(level, name, count, size, score)
- @level = level
- @name = name
- @count = count
- @size = size
- @score = score
- @children = {}
- end
-
- def to_bp
- record = {}
- record[:REC_TYPE] = "Classification"
- record[:LEVEL] = @level
- record[:NAME] = @name
- record[:COUNT] = @size
- record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2)
-
- record
- end
- end
end
casts = []
-casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'size', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'size', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'min_count', :short=>'m', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
options = Biopieces.options_parse(ARGV, casts)
-tax_tree = TaxTree.new
+tax_tree = TaxNode.new("root", "Root", 0, 0.0)
tax_hash = {}
Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
if options[:size]
if record[:Q_ID].match(/_(\d+)$/)
size = $1.to_i
- else
- raise BiopiecesError, "Could not extract size from Q_ID: #{record[:Q_ID]}"
+# else
+# raise BiopiecesError, "Could not extract size from Q_ID: #{record[:Q_ID]}"
end
end
- tax_hash[record[:Q_ID]] = TaxTree.new unless tax_hash[record[:Q_ID]]
- tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ if options[:LCA]
+ tax_hash[record[:Q_ID]] = TaxNode.new("root", "Root", 0, 0.0) unless tax_hash[record[:Q_ID]]
+ tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ else
+ tax_tree.add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ end
else
output.puts record
end
end
if options[:LCA]
- tax_hash.each do |q_id, tree|
+ tax_hash.each_value do |tree|
+ tree.debranch(options[:min_count]) if options[:min_count]
tree.lowest_common_ancestor
end
- end
- tax_hash.each do |q_id, tree|
- tax_tree.merge(tree)
+ tax_hash.each_value do |tree|
+ tax_tree.merge(tree)
+ end
end
tax_tree.each do |node|