From: martinahansen Date: Thu, 12 Jan 2012 11:09:58 +0000 (+0000) Subject: added align.rb to ruby code X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=61c5383f9d666a62e3683a08d276c847e626255b;p=biopieces.git added align.rb to ruby code git-svn-id: http://biopieces.googlecode.com/svn/trunk@1724 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb new file mode 100755 index 0000000..13fb96f --- /dev/null +++ b/code_ruby/lib/maasha/align.rb @@ -0,0 +1,239 @@ +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This software is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'pp' +require 'open3' +require 'narray' +require 'maasha/fasta' + +class AlignError < StandardError; end; + +CONSENSUS = 20 +INDEL_RATIO = 0.5 +QUAL_MIN = 15 +QUAL_BASE = 64 +ALPH_DNA = %w{A T C G} +ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} + +BIT_INDEL = 0 +BIT_A = 1 << 0 +BIT_T = 1 << 1 +BIT_C = 1 << 2 +BIT_G = 1 << 3 +BIT_M = BIT_A | BIT_C +BIT_R = BIT_A | BIT_G +BIT_W = BIT_A | BIT_T +BIT_S = BIT_C | BIT_G +BIT_Y = BIT_C | BIT_T +BIT_K = BIT_G | BIT_T +BIT_V = BIT_A | BIT_C | BIT_G +BIT_H = BIT_A | BIT_C | BIT_T +BIT_D = BIT_A | BIT_G | BIT_T +BIT_B = BIT_C | BIT_G | BIT_T +BIT_N = BIT_G | BIT_A | BIT_T | BIT_C + +BITMAP = [ + BIT_A, + BIT_T, + BIT_C, + BIT_G, + BIT_M, + BIT_R, + BIT_W, + BIT_S, + BIT_Y, + BIT_K, + BIT_V, + BIT_H, + BIT_D, + BIT_B, + BIT_N +] + +TR_NUC = "-" + ALPH_AMBI.join("").downcase +TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*") + +ROW_A = 0 +ROW_T = 1 +ROW_C = 2 +ROW_G = 3 + +# Adding an initialize method to the existing Fasta class. +class Fasta + def initialize(io) + @io = io + end +end + +# Class for aligning sequences. +class Align + # Class method to align sequences in a list of Seq objects and + # return these as a new list of Seq objects. + def self.muscle(entries) + cmd = "muscle" + aligned = [] + + Open3.popen3(cmd) do |stdin, stdout, stderr| + entries.each { |entry| stdin.puts entry.to_fasta } + stdin.close + + fa = Fasta.new(stdout) + + fa.each do |align| + if block_given? + yield align + else + aligned << align + end + end + end + + return aligned unless block_given? + end + + # Class method to create a consensus sequence from a list of Seq objects and + # return a new Seq object with the consensus. + def self.consensus(entries) + cons = Consensus.new(entries.first.length) + + entries.each { |entry| cons.add(entry) } + + cons.to_seq + end + + class Consensus + # Method to initialize a Consensus object given a Seq object, which is added + # to the Consensus. + def initialize(length) + @length = length + + @count = NArray.int(@length) + @freq = NArray.int(@length, ALPH_DNA.size) + @qual = NArray.int(@length, ALPH_DNA.size) + @mask_A = NArray.byte(@length).fill!(BIT_A) + @mask_T = NArray.byte(@length).fill!(BIT_T) + @mask_C = NArray.byte(@length).fill!(BIT_C) + @mask_G = NArray.byte(@length).fill!(BIT_G) + @has_qual = false + end + + # Method to add a Seq entry to a Consensus object. + def add(entry) + seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte") + + @count += seq > 0 + + mask_A = (seq & @mask_A) > 0 + mask_T = (seq & @mask_T) > 0 + mask_C = (seq & @mask_C) > 0 + mask_G = (seq & @mask_G) > 0 + + @freq[true, ROW_A] += mask_A + @freq[true, ROW_T] += mask_T + @freq[true, ROW_C] += mask_C + @freq[true, ROW_G] += mask_G + + unless entry.qual.nil? + @has_qual = true + + qual = NArray.to_na(entry.qual, "byte") - QUAL_BASE + + @qual[true, ROW_A] += mask_A * qual + @qual[true, ROW_T] += mask_T * qual + @qual[true, ROW_C] += mask_C * qual + @qual[true, ROW_G] += mask_G * qual + end + end + + def to_seq + #qual_filter + new_seq = Seq.new + new_seq.seq = consensus_seq + new_seq.qual = consensus_qual if @has_qual + new_seq.type = "dna" + + new_seq + end + + private + + # Method that calculates the sum for each column except the indel row and + # returns the sum in an NArray. + def freq_total + @freq.sum(1) + end + + # Method that locates the maximum value for each column and + # returns this in an NArray. + def freq_max + @freq.max(1) + end + + def qual_mean + freq = @freq[true, [0 ... ALPH_DNA.size]] + @qual / (freq + freq.eq(0)) + end + + def freq_percent + (@freq / freq_total.to_type("float") * 100).round + end + + # Method to calculate a consensus sequence from a Consensus object. + def consensus_seq + fp = freq_percent > CONSENSUS + cons = NArray.byte(@length) + cons |= fp[true, ROW_A] *= BIT_A + cons |= fp[true, ROW_T] *= BIT_T + cons |= fp[true, ROW_C] *= BIT_C + cons |= fp[true, ROW_G] *= BIT_G + + cons *= @count > @count.max * INDEL_RATIO + + cons.to_s.tr!(TR_HEX, TR_NUC).upcase + end + + def consensus_qual + q = (@qual / (@count + @count.eq(0))).max(1) + + (q.to_type("byte") + QUAL_BASE).to_s + end + + def qual_filter + filter = qual_mean > QUAL_MIN + + # pp @freq + # pp @qual + + # @freq *= filter + # @qual *= filter + + # pp @freq + # pp @qual + end + end +end + + + +__END__