From 0ad4208a00739c96c47726efa45a694a56607057 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 13 Jun 2012 13:50:15 +0000 Subject: [PATCH] added classify_taxonomy (unfinished biopiece) git-svn-id: http://biopieces.googlecode.com/svn/trunk@1841 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/classify_taxonomy | 180 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100755 bp_bin/classify_taxonomy diff --git a/bp_bin/classify_taxonomy b/bp_bin/classify_taxonomy new file mode 100755 index 0000000..12aabb8 --- /dev/null +++ b/bp_bin/classify_taxonomy @@ -0,0 +1,180 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2012 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Classify records with taxonomy information. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'pp' +require 'maasha/biopieces' + +class TaxTable + def initialize + @q_hash = Hash.new { |h, k| h[k] = [] } + end + + # Method to add to the TaxTree a GreenGenes entry. + def add_gg(q_id, s_id, score) + hash = {} + + s_id.scan(/ ([\w])__([^;]+)/) do + level = $1 + name = $2 + + hash[expand_level(level).to_sym] = name + end + + @q_hash[q_id.to_sym] << hash + end + + def lowest_common_ancestor + @q_hash.each do |q_id, list| + count_hash = Hash.new { |h, k| h[k] = Hash.new(0) } + + list.each do |elem| + elem.each do |level, name| + count_hash[level][name] += 1 + end + end + + list.each do |elem| + elem.each do |level, name| + elem.delete(level) if count_hash[level].size > 1 + end + end + end + end + + def uniq + lookup_hash = Hash.new + new_hash = Hash.new { |h, k| h[k] = [] } + + @q_hash.each do |q_id, list| + list.each do |elem| + unless lookup_hash[elem.to_s] + new_hash[q_id] << elem + lookup_hash[elem.to_s] = true + end + end + end + + @q_hash = new_hash + end + + def count + count_hash = Hash.new { |h, k| h[k] = Hash.new(0) } + new_hash = Hash.new { |h, k| h[k] = [] } + + @q_hash.each do |q_id, list| + list.each do |elem| + elem.each do |level, name| + count_hash[level][name] += 1 + end + end + end + + @q_hash.each do |q_id, list| + new_list = [] + + list.each do |elem| + new_elem = {} + + elem.each do |level, name| + new_elem[level] = name + new_elem["#{level}_count".to_sym] = count_hash[level][name] + end + + new_list << new_elem + end + + new_hash[q_id] = new_list + end + + @q_hash = new_hash + end + + def each + @q_hash.each do |q_id, list| + list.each do |elem| + yield elem + end + end + end + + private + + def expand_level(level) + case level + when 'd' then level = "domain" + when 'k' then level = "kingdom" + when 'p' then level = "phylum" + when 'c' then level = "class" + when 'o' then level = "order" + when 'f' then level = "family" + when 'g' then level = "genus" + when 's' then level = "species" + else + raise "unknown level: #{level}" + end + + level + end +end + +casts = [] +casts << {:long=>'LCA', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'uniq', :short=>'u', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +taxtab = TaxTable.new + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + if record[:Q_ID] and record[:S_ID] and record[:SCORE] + taxtab.add_gg(record[:Q_ID], record[:S_ID], record[:SCORE]) + end + end + + taxtab.lowest_common_ancestor if options[:LCA] + taxtab.uniq if options[:uniq] + taxtab.count + + taxtab.each do |tax| + record = {} + + tax.each { |k, v| record[k.upcase] = v } + + output.puts record + end +end + + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ -- 2.39.5