From: martinahansen Date: Fri, 27 Jan 2012 13:09:20 +0000 (+0000) Subject: final touches to align.rb X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=64d9b883ac41c0e7263aed9d30bd302615764776;p=biopieces.git final touches to align.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1735 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb index 411e459..cc38a83 100755 --- a/code_ruby/lib/maasha/align.rb +++ b/code_ruby/lib/maasha/align.rb @@ -29,18 +29,19 @@ require 'maasha/fasta' class AlignError < StandardError; end; -CONSENSUS = 20 +CONSENSUS = 0.20 INDEL_RATIO = 0.5 -QUAL_MIN = 15 +FREQ_MIN = 2 QUAL_BASE = 64 ALPH_DNA = %w{A T C G} ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} BIT_INDEL = 0 -BIT_A = 1 << 0 -BIT_T = 1 << 1 -BIT_C = 1 << 2 -BIT_G = 1 << 3 +# BIT_A = 1 << 0 +# BIT_T = 1 << 1 +# BIT_C = 1 << 2 +# BIT_G = 1 << 3 + BIT_M = BIT_A | BIT_C BIT_R = BIT_A | BIT_G BIT_W = BIT_A | BIT_T @@ -91,6 +92,8 @@ end # Class for aligning sequences. class Align + attr_reader :entries + # Class method to align sequences in a list of Seq objects and # return these as a new list of Seq objects. def self.muscle(entries, verbose = false) @@ -124,26 +127,59 @@ class Align end end - result + self.new(result) + end + + # Method to initialize an Align object with a list of aligned Seq objects. + def initialize(entries) + @entries = entries + end + + # Method that returns the length of the alignment. + def length + @entries.first.length + end + + # Method that returns the number of members or sequences in the alignment. + def members + @entries.size end - # Class method to create a consensus sequence from a list of Seq objects and + # Method to create a consensus sequence from an Align object and # return a new Seq object with the consensus. - def self.consensus(entries) - cons = Consensus.new(entries.first.length) + def consensus + cons = Consensus.new(self.length) - entries.each { |entry| cons.add(entry) } + @entries.each { |entry| cons.add(entry) } cons.to_seq end + # Method to pretty print an alignment from an Align object. + def to_s + cons = self.consensus + cons.mask_seq_soft!(20) unless cons.qual.nil? + + max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first + + @entries.each do |entry| + puts entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + end + + output = "" + output << " " * (max_name + 3) + cons.seq + $/ + output << " " * (max_name + 3) + cons.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons.qual.nil? + output + end + + private + class Consensus # Method to initialize a Consensus object given a Seq object, which is added # to the Consensus. def initialize(length) @length = length - @count = NArray.int(@length) @freq = NArray.int(@length, ALPH_DNA.size) @qual = NArray.int(@length, ALPH_DNA.size) @mask_A = NArray.byte(@length).fill!(BIT_A) @@ -151,13 +187,14 @@ class Align @mask_C = NArray.byte(@length).fill!(BIT_C) @mask_G = NArray.byte(@length).fill!(BIT_G) @has_qual = false + @count = 0 end # Method to add a Seq entry to a Consensus object. def add(entry) - seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte") + @count += 1 - @count += seq > 0 + seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte") mask_A = (seq & @mask_A) > 0 mask_T = (seq & @mask_T) > 0 @@ -182,7 +219,6 @@ class Align end def to_seq - #qual_filter new_seq = Seq.new new_seq.seq = consensus_seq new_seq.qual = consensus_qual if @has_qual @@ -193,58 +229,30 @@ class Align private - # Method that calculates the sum for each column except the indel row and - # returns the sum in an NArray. - def freq_total - @freq.sum(1) - end - - # Method that locates the maximum value for each column and - # returns this in an NArray. - def freq_max - @freq.max(1) - end - - def qual_mean - freq = @freq[true, [0 ... ALPH_DNA.size]] - @qual / (freq + freq.eq(0)) - end - - def freq_percent - (@freq / freq_total.to_type("float") * 100).round - end - # Method to calculate a consensus sequence from a Consensus object. def consensus_seq - fp = freq_percent > CONSENSUS + fp = (@freq.to_type("float") / @count) > CONSENSUS + + fp *= @freq >= FREQ_MIN + cons = NArray.byte(@length) cons |= fp[true, ROW_A] *= BIT_A cons |= fp[true, ROW_T] *= BIT_T cons |= fp[true, ROW_C] *= BIT_C cons |= fp[true, ROW_G] *= BIT_G - - cons *= @count > @count.max * INDEL_RATIO +# cons *= @freq.sum(1).to_type("float") / @count > INDEL_RATIO cons.to_s.tr!(TR_HEX, TR_NUC).upcase end def consensus_qual - q = (@qual / (@count + @count.eq(0))).max(1) + q = qual_mean.max(1) (q.to_type("byte") + QUAL_BASE).to_s end - def qual_filter - filter = qual_mean > QUAL_MIN - - # pp @freq - # pp @qual - - # @freq *= filter - # @qual *= filter - - # pp @freq - # pp @qual + def qual_mean + @qual / (@freq + @freq.eq(0)) end end end