require 'pp'
require 'maasha/biopieces'
-class TaxTable
- def initialize
- @q_hash = Hash.new { |h, k| h[k] = [] }
+# Class containing methods to construct a taxonomic tree recursively
+# for the classification of organisms. Currently only works with GreenGenes type of entries
+class TaxNode
+ attr_accessor :level, :name, :count, :score, :children
+
+ # Method to initalize a TaxNode object.
+ def initialize(level, name, count, score)
+ @level = level # Taxonomic level e.g. phylum, class, etc
+ @name = name # Name of organism
+ @count = count # Number of times this organism was encountered
+ @score = score # Similarity score
+ @children = {}
end
- # Method to add to the TaxTree a GreenGenes entry.
- def add_gg(q_id, s_id, score)
- hash = {}
+ # Method to add to the taxonomic tree a GreenGenes entry.
+ def add_gg(s_id, score, size)
+ node = self
- s_id.scan(/ ([\w])__([^;]+)/) do
- level = $1
+ s_id.scan(/([\w])__([^;]+)/) do
+ level = expand_level($1)
name = $2
- hash[expand_level(level).to_sym] = name
- end
+ if node.children[name].nil?
+ node.children[name] = TaxNode.new(level, name, size * 1, size * score)
+ else
+ node.children[name].count += size * 1
+ node.children[name].score += size * score
+ end
- @q_hash[q_id.to_sym] << hash
+ node = node.children[name]
+ end
end
- def lowest_common_ancestor
- @q_hash.each do |q_id, list|
- count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
-
- list.each do |elem|
- elem.each do |level, name|
- count_hash[level][name] += 1
- end
- end
+ # Method to merge two TaxNodes.
+ def merge(node_new, node_old = self)
+ node_new.children.each do |name, child|
+ if node_old.children[name]
+ node_old.children[name].count += child.count
+ node_old.children[name].score += child.score
- list.each do |elem|
- elem.each do |level, name|
- elem.delete(level) if count_hash[level].size > 1
- end
+ merge(child, node_old.children[name])
+ else
+ node_old.children[name] = child
end
end
end
- def uniq
- lookup_hash = Hash.new
- new_hash = Hash.new { |h, k| h[k] = [] }
+ # Method to flatten a taxonomic tree turning this into a list by recursive depth first traversal.
+ def flatten(node = self, list = [])
+ list << TaxNode.new(node.level, node.name, node.count, node.score)
- @q_hash.each do |q_id, list|
- list.each do |elem|
- unless lookup_hash[elem.to_s]
- new_hash[q_id] << elem
- lookup_hash[elem.to_s] = true
- end
- end
+ node.children.each_value do |child|
+ flatten(child, list)
end
- @q_hash = new_hash
+ list
end
- def count
- count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
- new_hash = Hash.new { |h, k| h[k] = [] }
+ # Method to recursively remove branches in taxonomic tree where the child count is less than or
+ # equal to a given minimum.
+ def debranch(min_count = nil)
+ node = self
- @q_hash.each do |q_id, list|
- list.each do |elem|
- elem.each do |level, name|
- count_hash[level][name] += 1
- end
- end
+ node.children.each do |name, child|
+ node.children.delete(name) if child.count <= min_count
end
- @q_hash.each do |q_id, list|
- new_list = []
+ node.children.each_value do |child|
+ child.debranch(min_count)
+ end
+ end
- list.each do |elem|
- new_elem = {}
-
- elem.each do |level, name|
- new_elem[level] = name
- new_elem["#{level}_count".to_sym] = count_hash[level][name]
- end
+ # Method to recursively trim a taxonomic tree so that it only contains an unbranched tree,
+ # which gives the lowest common ancestor.
+ def lowest_common_ancestor
+ node = self
- new_list << new_elem
- end
+ node.children = {} if node.children.size > 1
- new_hash[q_id] = new_list
+ node.children.each_value do |child|
+ child.lowest_common_ancestor
end
-
- @q_hash = new_hash
end
+ # Method for iterating over a taxonomic tree.
def each
- @q_hash.each do |q_id, list|
- list.each do |elem|
- yield elem
- end
+ self.flatten.each do |node|
+ yield node
end
+
+ self
+ end
+
+ # Method to convert a TaxNode to a Biopiece record.
+ def to_bp
+ record = {}
+ record[:REC_TYPE] = "Classification"
+ record[:LEVEL] = @level
+ record[:NAME] = @name
+ record[:COUNT] = @count
+ record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2)
+
+ record
end
private
+ # Method containing a helper hash to expand the phylogenetic level name.
def expand_level(level)
- case level
- when 'd' then level = "domain"
- when 'k' then level = "kingdom"
- when 'p' then level = "phylum"
- when 'c' then level = "class"
- when 'o' then level = "order"
- when 'f' then level = "family"
- when 'g' then level = "genus"
- when 's' then level = "species"
- else
- raise "unknown level: #{level}"
- end
-
- level
+ hash = {
+ 'd' => "domain",
+ 'k' => "kingdom",
+ 'p' => "phylum",
+ 'c' => "class",
+ 'o' => "order",
+ 'f' => "family",
+ 'g' => "genus",
+ 's' => "species"
+ }
+
+ raise "unknown level: #{level}" unless hash[level]
+
+ hash[level]
end
end
casts = []
-casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'uniq', :short=>'u', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'size', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'min_count', :short=>'m', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
options = Biopieces.options_parse(ARGV, casts)
-taxtab = TaxTable.new
+tax_tree = TaxNode.new("root", "Root", 0, 0.0)
+tax_hash = {}
Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
input.each_record do |record|
if record[:Q_ID] and record[:S_ID] and record[:SCORE]
- taxtab.add_gg(record[:Q_ID], record[:S_ID], record[:SCORE])
+ size = 1
+
+ if options[:size]
+ if record[:Q_ID].match(/_(\d+)$/)
+ size = $1.to_i
+# else
+# raise BiopiecesError, "Could not extract size from Q_ID: #{record[:Q_ID]}"
+ end
+ end
+
+ if options[:LCA]
+ tax_hash[record[:Q_ID]] = TaxNode.new("root", "Root", 0, 0.0) unless tax_hash[record[:Q_ID]]
+ tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ else
+ tax_tree.add_gg(record[:S_ID], record[:SCORE].to_f, size)
+ end
+ else
+ output.puts record
end
end
- taxtab.lowest_common_ancestor if options[:LCA]
- taxtab.uniq if options[:uniq]
- taxtab.count
-
- taxtab.each do |tax|
- record = {}
+ if options[:LCA]
+ tax_hash.each_value do |tree|
+ tree.debranch(options[:min_count]) if options[:min_count]
+ tree.lowest_common_ancestor
+ end
- tax.each { |k, v| record[k.upcase] = v }
+ tax_hash.each_value do |tree|
+ tax_tree.merge(tree)
+ end
+ end
- output.puts record
+ tax_tree.each do |node|
+ output.puts node.to_bp unless node.level == 'root'
end
end
-
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<