X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bp_bin%2Fclassify_taxonomy;h=64303fb6b582432ae052b2ceebb36684c3443f84;hb=2f0fd91b461033529a4a72e161bd133252a22eb6;hp=12aabb89cdd537b4ecf2106d6802c2d4a610a191;hpb=0ad4208a00739c96c47726efa45a694a56607057;p=biopieces.git diff --git a/bp_bin/classify_taxonomy b/bp_bin/classify_taxonomy index 12aabb8..64303fb 100755 --- a/bp_bin/classify_taxonomy +++ b/bp_bin/classify_taxonomy @@ -31,149 +31,183 @@ require 'pp' require 'maasha/biopieces' -class TaxTable - def initialize - @q_hash = Hash.new { |h, k| h[k] = [] } +# Class containing methods to construct a taxonomic tree recursively +# for the classification of organisms. Currently only works with GreenGenes type of entries +class TaxNode + attr_accessor :level, :name, :count, :score, :children + + # Method to initalize a TaxNode object. + def initialize(level, name, count, score) + @level = level # Taxonomic level e.g. phylum, class, etc + @name = name # Name of organism + @count = count # Number of times this organism was encountered + @score = score # Similarity score + @children = {} end - # Method to add to the TaxTree a GreenGenes entry. - def add_gg(q_id, s_id, score) - hash = {} + # Method to add to the taxonomic tree a GreenGenes entry. + def add_gg(s_id, score, size) + node = self - s_id.scan(/ ([\w])__([^;]+)/) do - level = $1 + s_id.scan(/([\w])__([^;]+)/) do + level = expand_level($1) name = $2 - hash[expand_level(level).to_sym] = name - end + if node.children[name].nil? + node.children[name] = TaxNode.new(level, name, size * 1, size * score) + else + node.children[name].count += size * 1 + node.children[name].score += size * score + end - @q_hash[q_id.to_sym] << hash + node = node.children[name] + end end - def lowest_common_ancestor - @q_hash.each do |q_id, list| - count_hash = Hash.new { |h, k| h[k] = Hash.new(0) } - - list.each do |elem| - elem.each do |level, name| - count_hash[level][name] += 1 - end - end + # Method to merge two TaxNodes. + def merge(node_new, node_old = self) + node_new.children.each do |name, child| + if node_old.children[name] + node_old.children[name].count += child.count + node_old.children[name].score += child.score - list.each do |elem| - elem.each do |level, name| - elem.delete(level) if count_hash[level].size > 1 - end + merge(child, node_old.children[name]) + else + node_old.children[name] = child end end end - def uniq - lookup_hash = Hash.new - new_hash = Hash.new { |h, k| h[k] = [] } + # Method to flatten a taxonomic tree turning this into a list by recursive depth first traversal. + def flatten(node = self, list = []) + list << TaxNode.new(node.level, node.name, node.count, node.score) - @q_hash.each do |q_id, list| - list.each do |elem| - unless lookup_hash[elem.to_s] - new_hash[q_id] << elem - lookup_hash[elem.to_s] = true - end - end + node.children.each_value do |child| + flatten(child, list) end - @q_hash = new_hash + list end - def count - count_hash = Hash.new { |h, k| h[k] = Hash.new(0) } - new_hash = Hash.new { |h, k| h[k] = [] } + # Method to recursively remove branches in taxonomic tree where the child count is less than or + # equal to a given minimum. + def debranch(min_count = nil) + node = self - @q_hash.each do |q_id, list| - list.each do |elem| - elem.each do |level, name| - count_hash[level][name] += 1 - end - end + node.children.each do |name, child| + node.children.delete(name) if child.count <= min_count end - @q_hash.each do |q_id, list| - new_list = [] + node.children.each_value do |child| + child.debranch(min_count) + end + end - list.each do |elem| - new_elem = {} - - elem.each do |level, name| - new_elem[level] = name - new_elem["#{level}_count".to_sym] = count_hash[level][name] - end + # Method to recursively trim a taxonomic tree so that it only contains an unbranched tree, + # which gives the lowest common ancestor. + def lowest_common_ancestor + node = self - new_list << new_elem - end + node.children = {} if node.children.size > 1 - new_hash[q_id] = new_list + node.children.each_value do |child| + child.lowest_common_ancestor end - - @q_hash = new_hash end + # Method for iterating over a taxonomic tree. def each - @q_hash.each do |q_id, list| - list.each do |elem| - yield elem - end + self.flatten.each do |node| + yield node end + + self + end + + # Method to convert a TaxNode to a Biopiece record. + def to_bp + record = {} + record[:REC_TYPE] = "Classification" + record[:LEVEL] = @level + record[:NAME] = @name + record[:COUNT] = @count + record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2) + + record end private + # Method containing a helper hash to expand the phylogenetic level name. def expand_level(level) - case level - when 'd' then level = "domain" - when 'k' then level = "kingdom" - when 'p' then level = "phylum" - when 'c' then level = "class" - when 'o' then level = "order" - when 'f' then level = "family" - when 'g' then level = "genus" - when 's' then level = "species" - else - raise "unknown level: #{level}" - end - - level + hash = { + 'd' => "domain", + 'k' => "kingdom", + 'p' => "phylum", + 'c' => "class", + 'o' => "order", + 'f' => "family", + 'g' => "genus", + 's' => "species" + } + + raise "unknown level: #{level}" unless hash[level] + + hash[level] end end casts = [] -casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'uniq', :short=>'u', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'size', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'min_count', :short=>'m', :type=>'uint', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) -taxtab = TaxTable.new +tax_tree = TaxNode.new("root", "Root", 0, 0.0) +tax_hash = {} Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record[:Q_ID] and record[:S_ID] and record[:SCORE] - taxtab.add_gg(record[:Q_ID], record[:S_ID], record[:SCORE]) + size = 1 + + if options[:size] + if record[:Q_ID].match(/_(\d+)$/) + size = $1.to_i +# else +# raise BiopiecesError, "Could not extract size from Q_ID: #{record[:Q_ID]}" + end + end + + if options[:LCA] + tax_hash[record[:Q_ID]] = TaxNode.new("root", "Root", 0, 0.0) unless tax_hash[record[:Q_ID]] + tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size) + else + tax_tree.add_gg(record[:S_ID], record[:SCORE].to_f, size) + end + else + output.puts record end end - taxtab.lowest_common_ancestor if options[:LCA] - taxtab.uniq if options[:uniq] - taxtab.count - - taxtab.each do |tax| - record = {} + if options[:LCA] + tax_hash.each_value do |tree| + tree.debranch(options[:min_count]) if options[:min_count] + tree.lowest_common_ancestor + end - tax.each { |k, v| record[k.upcase] = v } + tax_hash.each_value do |tree| + tax_tree.merge(tree) + end + end - output.puts record + tax_tree.each do |node| + output.puts node.to_bp unless node.level == 'root' end end - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<