3 # Copyright (C) 2007-2012 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
21 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
23 # This program is part of the Biopieces framework (www.biopieces.org).
25 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
27 # Classify records with taxonomy information.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
32 require 'maasha/biopieces'
36 @tree = TaxNode.new("root", "Root", 0, 0, 0.0)
39 # Method to add to the TaxTree a GreenGenes entry.
40 def add_gg(s_id, score, size)
43 s_id.scan(/ ([\w])__([^;]+)/) do
44 level = expand_level($1)
47 if node.children[name].nil?
48 node.children[name] = TaxNode.new(level, name, 1, size, score)
50 node.children[name].count += 1
51 node.children[name].score += score
54 node = node.children[name]
61 tree.flatten.each do |new_node|
62 next if new_node.level == 'root'
64 new_node.score = new_node.score / new_node.count
67 if node.children[new_node.name].nil?
68 node.children[new_node.name] = new_node
70 node.children[new_node.name].count += new_node.count
71 node.children[new_node.name].score += new_node.score
72 node.children[new_node.name].size += new_node.size
75 node = node.children[new_node.name]
79 def flatten(node = @tree, list = [])
80 list << TaxNode.new(node.level, node.name, node.count, node.size, node.score)
82 node.children.each do |name, child|
83 list = flatten(child, list.dup)
89 def lowest_common_ancestor(node = @tree)
90 node.children = {} if node.children.size > 1
92 node.children.each do |name, child|
93 lowest_common_ancestor(child)
98 self.flatten.each do |node|
107 def expand_level(level)
119 raise "unknown level: #{level}" unless hash[level]
125 attr_accessor :level, :name, :count, :size, :score, :children
127 def initialize(level, name, count, size, score)
138 record[:REC_TYPE] = "Classification"
139 record[:LEVEL] = @level
140 record[:NAME] = @name
141 record[:COUNT] = @size
142 record[:SCORE] = @count == 0 ? 0.0 : (@score / @count).round(2)
150 casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
151 casts << {:long=>'size', :short=>'s', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
153 options = Biopieces.options_parse(ARGV, casts)
155 tax_tree = TaxTree.new
158 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
159 input.each_record do |record|
160 if record[:Q_ID] and record[:S_ID] and record[:SCORE]
164 if record[:Q_ID].match(/_(\d+)$/)
167 raise BiopiecesError, "Could not extract size from Q_ID: #{record[:Q_ID]}"
171 tax_hash[record[:Q_ID]] = TaxTree.new unless tax_hash[record[:Q_ID]]
172 tax_hash[record[:Q_ID]].add_gg(record[:S_ID], record[:SCORE].to_f, size)
179 tax_hash.each do |q_id, tree|
180 tree.lowest_common_ancestor
184 tax_hash.each do |q_id, tree|
188 tax_tree.each do |node|
189 output.puts node.to_bp unless node.level == 'root'
194 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<