]> git.donarmstrong.com Git - biopieces.git/commitdiff
classify_taxonomy work
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 20 Jun 2012 09:38:04 +0000 (09:38 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Wed, 20 Jun 2012 09:38:04 +0000 (09:38 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1842 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/classify_taxonomy

index 12aabb89cdd537b4ecf2106d6802c2d4a610a191..45da037e1d38c9aca3911ce721feadbccd9b7a31 100755 (executable)
 require 'pp'
 require 'maasha/biopieces'
 
-class TaxTable
+class TaxTree
   def initialize
-    @q_hash = Hash.new { |h, k| h[k] = [] }
+    @tree = TaxNode.new("root", "Root", 0, 0.0)
   end
 
   # Method to add to the TaxTree a GreenGenes entry.
-  def add_gg(q_id, s_id, score)
-    hash = {}
+  def add_gg(s_id, score)
+    node = @tree
 
     s_id.scan(/ ([\w])__([^;]+)/) do
-      level = $1
+      level = expand_level($1)
       name  = $2
 
-      hash[expand_level(level).to_sym] = name
-    end
+      if node.children[name].nil?
+        node.children[name] = TaxNode.new(level, name, 1, score)
+      else
+        node.children[name].count += 1
+        node.children[name].score += score
+      end
 
-    @q_hash[q_id.to_sym] << hash
+      node = node.children[name]
+    end
   end
 
-  def lowest_common_ancestor
-    @q_hash.each do |q_id, list|
-      count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
+  def merge(tree)
+    node = @tree
 
-      list.each do |elem|
-        elem.each do |level, name|
-          count_hash[level][name] += 1
-        end
-      end
+    tree.flatten.each do |new_node|
+      next if new_node.level == 'root'
 
-      list.each do |elem|
-        elem.each do |level, name|
-          elem.delete(level) if count_hash[level].size > 1
-        end
+      new_node.score = new_node.score / new_node.count
+      new_node.count = 1
+
+      if node.children[new_node.name].nil?
+        node.children[new_node.name] = new_node
+      else
+        node.children[new_node.name].count += new_node.count
+        node.children[new_node.name].score += new_node.score
       end
+
+      node = node.children[new_node.name]
     end
   end
 
-  def uniq
-    lookup_hash = Hash.new
-    new_hash    = Hash.new { |h, k| h[k] = [] }
+  def flatten(node = @tree, list = [])
+    list << TaxNode.new(node.level, node.name, node.count, node.score)
 
-    @q_hash.each do |q_id, list|
-      list.each do |elem|
-        unless lookup_hash[elem.to_s]
-          new_hash[q_id] << elem
-          lookup_hash[elem.to_s] = true
-        end
-      end
+    node.children.each do |name, child|
+      list = flatten(child, list.dup)
     end
 
-    @q_hash = new_hash
+    list
   end
 
-  def count
-    count_hash = Hash.new { |h, k| h[k] = Hash.new(0) }
-    new_hash   = Hash.new { |h, k| h[k] = [] }
+  def lowest_common_ancestor(node = @tree)
+    node.children = {} if node.children.size > 1
 
-    @q_hash.each do |q_id, list|
-      list.each do |elem|
-        elem.each do |level, name|
-          count_hash[level][name] += 1
-        end
-      end
+    node.children.each do |name, child|
+      lowest_common_ancestor(child)
     end
+  end
 
-    @q_hash.each do |q_id, list|
-      new_list = []
-
-      list.each do |elem|
-        new_elem = {}
-        
-        elem.each do |level, name|
-          new_elem[level] = name
-          new_elem["#{level}_count".to_sym] = count_hash[level][name]
-        end
+  def each
+    self.flatten.each do |node|
+      yield node
+    end
 
-        new_list << new_elem
-      end
+    self
+  end
 
-      new_hash[q_id] = new_list
+  def dump(node = @tree)
+    indent = 0
+
+    case node.level
+    when "kingdom" then indent = 2
+    when "phylum"  then indent = 4
+    when "class"   then indent = 6
+    when "order"   then indent = 8
+    when "family"  then indent = 10
+    when "genus"   then indent = 12
+    when "species" then indent = 14
     end
 
-    @q_hash = new_hash
-  end
+    puts (" " * indent) + "#{node.name} (#{node.level})" 
 
-  def each
-    @q_hash.each do |q_id, list|
-      list.each do |elem|
-        yield elem
-      end
+    node.children.each do |name, child|
+      dump(child)
     end
   end
 
   private
 
   def expand_level(level)
-    case level
-    when 'd' then level = "domain"
-    when 'k' then level = "kingdom"
-    when 'p' then level = "phylum"
-    when 'c' then level = "class"
-    when 'o' then level = "order"
-    when 'f' then level = "family"
-    when 'g' then level = "genus"
-    when 's' then level = "species"
-    else
-      raise "unknown level: #{level}"
+    hash = {
+      'd' => "domain",
+      'k' => "kingdom",
+      'p' => "phylum",
+      'c' => "class",
+      'o' => "order",
+      'f' => "family",
+      'g' => "genus",
+      's' => "species"
+    }
+
+    raise "unknown level: #{level}" unless hash[level]
+
+    hash[level]
+  end
+
+  class TaxNode
+    attr_accessor :level, :name, :count, :score, :children
+
+    def initialize(level, name, count, score)
+      @level    = level
+      @name     = name
+      @count    = count
+      @score    = score
+      @children = {}
     end
 
-    level
+    def to_bp
+      record = {}
+      record[:REC_TYPE] = "Classification"
+      record[:LEVEL]    = @level
+      record[:NAME]     = @name
+      record[:COUNT]    = @count
+      record[:SCORE]    = @count == 0 ? 0.0 : (@score / @count).round(2)
+
+      record
+    end
   end
 end
 
 casts = []
-casts << {:long=>'LCA',  :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
-casts << {:long=>'uniq', :short=>'u', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
 
 options = Biopieces.options_parse(ARGV, casts)
 
-taxtab = TaxTable.new
+tax_tree = TaxTree.new
+tax_hash = {}
 
 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
   input.each_record do |record|
     if record[:Q_ID] and record[:S_ID] and record[:SCORE]
-      taxtab.add_gg(record[:Q_ID], record[:S_ID], record[:SCORE])
+      tax_hash[record[:Q_ID]] = TaxTree.new unless tax_hash[record[:Q_ID]]
+      tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f)
+    else
+      output.puts record
     end
   end
 
-  taxtab.lowest_common_ancestor if options[:LCA]
-  taxtab.uniq                   if options[:uniq]
-  taxtab.count
-
-  taxtab.each do |tax|
-    record = {}
+  if options[:LCA]
+    tax_hash.each do |q_id, tree|
+      tree.lowest_common_ancestor
+    end
+  end
 
-    tax.each { |k, v| record[k.upcase] = v }
+  tax_hash.each do |q_id, tree|
+    tax_tree.merge(tree)
+  end
 
-    output.puts record
+  tax_tree.each do |node|
+    output.puts node.to_bp unless node.level == 'root'
   end
 end
 
 
-
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<