From: martinahansen Date: Fri, 12 Oct 2012 08:49:09 +0000 (+0000) Subject: fixed classify_taxonomy X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=45c20e71e46198e6dddc17913c5587f7133bf0cb;hp=c7ff322303c98722e7b37579a270001a6eacb40a;p=biopieces.git fixed classify_taxonomy git-svn-id: http://biopieces.googlecode.com/svn/trunk@1963 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/classify_taxonomy b/bp_bin/classify_taxonomy index 8c211de..f91964f 100755 --- a/bp_bin/classify_taxonomy +++ b/bp_bin/classify_taxonomy @@ -31,69 +31,67 @@ require 'pp' require 'maasha/biopieces' -# Class containing methods to construct a taxonomic tree for the classification of -# organisms. Currently only works with GreenGenes type of entries. -class TaxTree - # Method to initialize a new TaxTree object. - def initialize - @tree = TaxNode.new("root", "Root", 0, 0, 0.0) +# Class containing methods to construct a taxonomic tree recursively +# for the classification of organisms. Currently only works with GreenGenes type of entries +class TaxNode + attr_accessor :level, :name, :count, :score, :children + + # Method to initalize a TaxNode object. + def initialize(level, name, count, score) + @level = level # Taxonomic level e.g. phylum, class, etc + @name = name # Name of organism + @count = count # Number of times this organism was encountered + @score = score # Similarity score + @children = {} end - # Method to add to the TaxTree a GreenGenes entry. + # Method to add to the taxonomic tree a GreenGenes entry. def add_gg(s_id, score, size) - node = @tree + node = self s_id.scan(/ ([\w])__([^;]+)/) do level = expand_level($1) name = $2 if node.children[name].nil? - node.children[name] = TaxNode.new(level, name, 1, size, score) + node.children[name] = TaxNode.new(level, name, size * 1, size * score) else - node.children[name].count += 1 - node.children[name].score += score + node.children[name].count += size * 1 + node.children[name].score += size * score end node = node.children[name] end end - # Method to merge two TaxTrees. - def merge(tree) - node = @tree - - tree.flatten.each do |new_node| - next if new_node.level == 'root' - - new_node.score = new_node.score / new_node.count - new_node.count = 1 - - if node.children[new_node.name].nil? - node.children[new_node.name] = new_node + # Method to merge two TaxNodes. + def merge(node_new, node_old = self) + node_new.children.each do |name, child| + if node_old.children[name] + node_old.count += node_new.count + node_old.score += node_new.score else - node.children[new_node.name].count += new_node.count - node.children[new_node.name].score += new_node.score - node.children[new_node.name].size += new_node.size + node_old.children[name] = child end - node = node.children[new_node.name] + merge(child, node_old.children[name]) end end - # Method to flatten a TaxTree turning this into a list by recursive depth first traversal. - def flatten(node = @tree, list = []) - list << TaxNode.new(node.level, node.name, node.count, node.size, node.score) + # Method to flatten a taxonomic tree turning this into a list by recursive depth first traversal. + def flatten(node = self, list = []) + list << TaxNode.new(node.level, node.name, node.count, node.score) - node.children.each do |name, child| - list = flatten(child, list.dup) + node.children.each_value do |child| + flatten(child, list) end list end - # Method to recursively trim a TaxTree show that it only contains a unbranched tree, + # Method to recursively trim a taxonomic tree so that it only contains an unbranched tree, # which gives the lowest common ancestor. - def lowest_common_ancestor(node = @tree) + def lowest_common_ancestor(node = self) node.children = {} if node.children.size > 1 node.children.each do |name, child| @@ -101,7 +99,7 @@ class TaxTree end end - # Method for iterating over a TaxTree. + # Method for iterating over a taxonomic tree. def each self.flatten.each do |node| yield node @@ -110,6 +108,18 @@ class TaxTree self end + # Method to convert a TaxNode to a Biopiece record. + def to_bp + record = {} + record[:REC_TYPE] = "Classification" + record[:LEVEL] = @level + record[:NAME] = @name + record[:COUNT] = @count + record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2) + + record + end + private # Method containing a helper hash to expand the phylogenetic level name. @@ -129,32 +139,6 @@ class TaxTree hash[level] end - - # Nested class for TaxTree nodes. - class TaxNode - attr_accessor :level, :name, :count, :size, :score, :children - - def initialize(level, name, count, size, score) - @level = level - @name = name - @count = count - @size = size - @score = score - @children = {} - end - - # Method to convert a TaxNode to a Biopiece record. - def to_bp - record = {} - record[:REC_TYPE] = "Classification" - record[:LEVEL] = @level - record[:NAME] = @name - record[:COUNT] = @size - record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2) - - record - end - end end casts = [] @@ -163,7 +147,7 @@ casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>false, :default options = Biopieces.options_parse(ARGV, casts) -tax_tree = TaxTree.new +tax_tree = TaxNode.new("root", "Root", 0, 0.0) tax_hash = {} Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| @@ -179,21 +163,25 @@ Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| end end - tax_hash[record[:Q_ID]] = TaxTree.new unless tax_hash[record[:Q_ID]] - tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size) + if options[:LCA] + tax_hash[record[:Q_ID]] = TaxNode.new("root", "Root", 0, 0.0) unless tax_hash[record[:Q_ID]] + tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size) + else + tax_tree.add_gg(record[:S_ID], record[:SCORE].to_f, size) + end else output.puts record end end if options[:LCA] - tax_hash.each do |q_id, tree| + tax_hash.each_value do |tree| tree.lowest_common_ancestor end - end - tax_hash.each do |q_id, tree| - tax_tree.merge(tree) + tax_hash.each_value do |tree| + tax_tree.merge(tree) + end end tax_tree.each do |node|