X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=bp_bin%2Fclassify_taxonomy;h=64303fb6b582432ae052b2ceebb36684c3443f84;hb=b04ea01fc94d50b0f60b22ff8be7b21ce2c36987;hp=1614e5d020d1e6e732c2f99a5e5861dd9f7a6102;hpb=d68f8c54f74474676571d74a8309f7337fdb51d5;p=biopieces.git diff --git a/bp_bin/classify_taxonomy b/bp_bin/classify_taxonomy index 1614e5d..64303fb 100755 --- a/bp_bin/classify_taxonomy +++ b/bp_bin/classify_taxonomy @@ -31,69 +31,91 @@ require 'pp' require 'maasha/biopieces' -class TaxTree - def initialize - @tree = TaxNode.new("root", "Root", 0, 0, 0.0) +# Class containing methods to construct a taxonomic tree recursively +# for the classification of organisms. Currently only works with GreenGenes type of entries +class TaxNode + attr_accessor :level, :name, :count, :score, :children + + # Method to initalize a TaxNode object. + def initialize(level, name, count, score) + @level = level # Taxonomic level e.g. phylum, class, etc + @name = name # Name of organism + @count = count # Number of times this organism was encountered + @score = score # Similarity score + @children = {} end - # Method to add to the TaxTree a GreenGenes entry. + # Method to add to the taxonomic tree a GreenGenes entry. def add_gg(s_id, score, size) - node = @tree + node = self - s_id.scan(/ ([\w])__([^;]+)/) do + s_id.scan(/([\w])__([^;]+)/) do level = expand_level($1) name = $2 if node.children[name].nil? - node.children[name] = TaxNode.new(level, name, 1, size, score) + node.children[name] = TaxNode.new(level, name, size * 1, size * score) else - node.children[name].count += 1 - node.children[name].score += score + node.children[name].count += size * 1 + node.children[name].score += size * score end node = node.children[name] end end - def merge(tree) - node = @tree + # Method to merge two TaxNodes. + def merge(node_new, node_old = self) + node_new.children.each do |name, child| + if node_old.children[name] + node_old.children[name].count += child.count + node_old.children[name].score += child.score - tree.flatten.each do |new_node| - next if new_node.level == 'root' - - new_node.score = new_node.score / new_node.count - new_node.count = 1 - - if node.children[new_node.name].nil? - node.children[new_node.name] = new_node + merge(child, node_old.children[name]) else - node.children[new_node.name].count += new_node.count - node.children[new_node.name].score += new_node.score - node.children[new_node.name].size += new_node.size + node_old.children[name] = child end + end + end - node = node.children[new_node.name] + # Method to flatten a taxonomic tree turning this into a list by recursive depth first traversal. + def flatten(node = self, list = []) + list << TaxNode.new(node.level, node.name, node.count, node.score) + + node.children.each_value do |child| + flatten(child, list) end + + list end - def flatten(node = @tree, list = []) - list << TaxNode.new(node.level, node.name, node.count, node.size, node.score) + # Method to recursively remove branches in taxonomic tree where the child count is less than or + # equal to a given minimum. + def debranch(min_count = nil) + node = self node.children.each do |name, child| - list = flatten(child, list.dup) + node.children.delete(name) if child.count <= min_count end - list + node.children.each_value do |child| + child.debranch(min_count) + end end - def lowest_common_ancestor(node = @tree) + # Method to recursively trim a taxonomic tree so that it only contains an unbranched tree, + # which gives the lowest common ancestor. + def lowest_common_ancestor + node = self + node.children = {} if node.children.size > 1 - node.children.each do |name, child| - lowest_common_ancestor(child) + node.children.each_value do |child| + child.lowest_common_ancestor end end + # Method for iterating over a taxonomic tree. def each self.flatten.each do |node| yield node @@ -102,8 +124,21 @@ class TaxTree self end + # Method to convert a TaxNode to a Biopiece record. + def to_bp + record = {} + record[:REC_TYPE] = "Classification" + record[:LEVEL] = @level + record[:NAME] = @name + record[:COUNT] = @count + record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2) + + record + end + private + # Method containing a helper hash to expand the phylogenetic level name. def expand_level(level) hash = { 'd' => "domain", @@ -120,39 +155,16 @@ class TaxTree hash[level] end - - class TaxNode - attr_accessor :level, :name, :count, :size, :score, :children - - def initialize(level, name, count, size, score) - @level = level - @name = name - @count = count - @size = size - @score = score - @children = {} - end - - def to_bp - record = {} - record[:REC_TYPE] = "Classification" - record[:LEVEL] = @level - record[:NAME] = @name - record[:COUNT] = @size - record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2) - - record - end - end end casts = [] -casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'size', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'size', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'min_count', :short=>'m', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) -tax_tree = TaxTree.new +tax_tree = TaxNode.new("root", "Root", 0, 0.0) tax_hash = {} Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| @@ -163,26 +175,31 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| if options[:size] if record[:Q_ID].match(/_(\d+)$/) size = $1.to_i - else - raise BiopiecesError, "Could not extract size from Q_ID: #{record[:Q_ID]}" +# else +# raise BiopiecesError, "Could not extract size from Q_ID: #{record[:Q_ID]}" end end - tax_hash[record[:Q_ID]] = TaxTree.new unless tax_hash[record[:Q_ID]] - tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size) + if options[:LCA] + tax_hash[record[:Q_ID]] = TaxNode.new("root", "Root", 0, 0.0) unless tax_hash[record[:Q_ID]] + tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size) + else + tax_tree.add_gg(record[:S_ID], record[:SCORE].to_f, size) + end else output.puts record end end if options[:LCA] - tax_hash.each do |q_id, tree| + tax_hash.each_value do |tree| + tree.debranch(options[:min_count]) if options[:min_count] tree.lowest_common_ancestor end - end - tax_hash.each do |q_id, tree| - tax_tree.merge(tree) + tax_hash.each_value do |tree| + tax_tree.merge(tree) + end end tax_tree.each do |node|