From 8175fe9f47765cdbe1a5f51787a086198c25f151 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 20 Jun 2012 09:38:04 +0000 Subject: [PATCH] classify_taxonomy work git-svn-id: http://biopieces.googlecode.com/svn/trunk@1842 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/classify_taxonomy | 194 ++++++++++++++++++++++----------------- 1 file changed, 109 insertions(+), 85 deletions(-) diff --git a/bp_bin/classify_taxonomy b/bp_bin/classify_taxonomy index 12aabb8..45da037 100755 --- a/bp_bin/classify_taxonomy +++ b/bp_bin/classify_taxonomy @@ -31,149 +31,173 @@ require 'pp' require 'maasha/biopieces' -class TaxTable +class TaxTree def initialize - @q_hash = Hash.new { |h, k| h[k] = [] } + @tree = TaxNode.new("root", "Root", 0, 0.0) end # Method to add to the TaxTree a GreenGenes entry. - def add_gg(q_id, s_id, score) - hash = {} + def add_gg(s_id, score) + node = @tree s_id.scan(/ ([\w])__([^;]+)/) do - level = $1 + level = expand_level($1) name = $2 - hash[expand_level(level).to_sym] = name - end + if node.children[name].nil? + node.children[name] = TaxNode.new(level, name, 1, score) + else + node.children[name].count += 1 + node.children[name].score += score + end - @q_hash[q_id.to_sym] << hash + node = node.children[name] + end end - def lowest_common_ancestor - @q_hash.each do |q_id, list| - count_hash = Hash.new { |h, k| h[k] = Hash.new(0) } + def merge(tree) + node = @tree - list.each do |elem| - elem.each do |level, name| - count_hash[level][name] += 1 - end - end + tree.flatten.each do |new_node| + next if new_node.level == 'root' - list.each do |elem| - elem.each do |level, name| - elem.delete(level) if count_hash[level].size > 1 - end + new_node.score = new_node.score / new_node.count + new_node.count = 1 + + if node.children[new_node.name].nil? + node.children[new_node.name] = new_node + else + node.children[new_node.name].count += new_node.count + node.children[new_node.name].score += new_node.score end + + node = node.children[new_node.name] end end - def uniq - lookup_hash = Hash.new - new_hash = Hash.new { |h, k| h[k] = [] } + def flatten(node = @tree, list = []) + list << TaxNode.new(node.level, node.name, node.count, node.score) - @q_hash.each do |q_id, list| - list.each do |elem| - unless lookup_hash[elem.to_s] - new_hash[q_id] << elem - lookup_hash[elem.to_s] = true - end - end + node.children.each do |name, child| + list = flatten(child, list.dup) end - @q_hash = new_hash + list end - def count - count_hash = Hash.new { |h, k| h[k] = Hash.new(0) } - new_hash = Hash.new { |h, k| h[k] = [] } + def lowest_common_ancestor(node = @tree) + node.children = {} if node.children.size > 1 - @q_hash.each do |q_id, list| - list.each do |elem| - elem.each do |level, name| - count_hash[level][name] += 1 - end - end + node.children.each do |name, child| + lowest_common_ancestor(child) end + end - @q_hash.each do |q_id, list| - new_list = [] - - list.each do |elem| - new_elem = {} - - elem.each do |level, name| - new_elem[level] = name - new_elem["#{level}_count".to_sym] = count_hash[level][name] - end + def each + self.flatten.each do |node| + yield node + end - new_list << new_elem - end + self + end - new_hash[q_id] = new_list + def dump(node = @tree) + indent = 0 + + case node.level + when "kingdom" then indent = 2 + when "phylum" then indent = 4 + when "class" then indent = 6 + when "order" then indent = 8 + when "family" then indent = 10 + when "genus" then indent = 12 + when "species" then indent = 14 end - @q_hash = new_hash - end + puts (" " * indent) + "#{node.name} (#{node.level})" - def each - @q_hash.each do |q_id, list| - list.each do |elem| - yield elem - end + node.children.each do |name, child| + dump(child) end end private def expand_level(level) - case level - when 'd' then level = "domain" - when 'k' then level = "kingdom" - when 'p' then level = "phylum" - when 'c' then level = "class" - when 'o' then level = "order" - when 'f' then level = "family" - when 'g' then level = "genus" - when 's' then level = "species" - else - raise "unknown level: #{level}" + hash = { + 'd' => "domain", + 'k' => "kingdom", + 'p' => "phylum", + 'c' => "class", + 'o' => "order", + 'f' => "family", + 'g' => "genus", + 's' => "species" + } + + raise "unknown level: #{level}" unless hash[level] + + hash[level] + end + + class TaxNode + attr_accessor :level, :name, :count, :score, :children + + def initialize(level, name, count, score) + @level = level + @name = name + @count = count + @score = score + @children = {} end - level + def to_bp + record = {} + record[:REC_TYPE] = "Classification" + record[:LEVEL] = @level + record[:NAME] = @name + record[:COUNT] = @count + record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2) + + record + end end end casts = [] -casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} -casts << {:long=>'uniq', :short=>'u', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) -taxtab = TaxTable.new +tax_tree = TaxTree.new +tax_hash = {} Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record[:Q_ID] and record[:S_ID] and record[:SCORE] - taxtab.add_gg(record[:Q_ID], record[:S_ID], record[:SCORE]) + tax_hash[record[:Q_ID]] = TaxTree.new unless tax_hash[record[:Q_ID]] + tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f) + else + output.puts record end end - taxtab.lowest_common_ancestor if options[:LCA] - taxtab.uniq if options[:uniq] - taxtab.count - - taxtab.each do |tax| - record = {} + if options[:LCA] + tax_hash.each do |q_id, tree| + tree.lowest_common_ancestor + end + end - tax.each { |k, v| record[k.upcase] = v } + tax_hash.each do |q_id, tree| + tax_tree.merge(tree) + end - output.puts record + tax_tree.each do |node| + output.puts node.to_bp unless node.level == 'root' end end - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -- 2.39.5