From 612f00afe8baf1328b37b8537dc21f97414d0127 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 31 Jan 2012 20:25:25 +0000 Subject: [PATCH] major rewrite of align.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1738 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/lib/maasha/align.rb | 122 +++++++++++++++++----------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb index cc38a83..85c3490 100755 --- a/code_ruby/lib/maasha/align.rb +++ b/code_ruby/lib/maasha/align.rb @@ -30,9 +30,8 @@ require 'maasha/fasta' class AlignError < StandardError; end; CONSENSUS = 0.20 -INDEL_RATIO = 0.5 FREQ_MIN = 2 -QUAL_BASE = 64 +QUAL_MIN = 20 ALPH_DNA = %w{A T C G} ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} @@ -148,21 +147,19 @@ class Align # Method to create a consensus sequence from an Align object and # return a new Seq object with the consensus. def consensus - cons = Consensus.new(self.length) - - @entries.each { |entry| cons.add(entry) } - + cons = Consensus.new(@entries) cons.to_seq end # Method to pretty print an alignment from an Align object. def to_s cons = self.consensus - cons.mask_seq_soft!(20) unless cons.qual.nil? + cons.mask_seq_soft!(QUAL_MIN) unless cons.qual.nil? max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first @entries.each do |entry| + entry.mask_seq_soft!(QUAL_MIN) unless entry.qual.nil? puts entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq end @@ -175,53 +172,31 @@ class Align private class Consensus - # Method to initialize a Consensus object given a Seq object, which is added - # to the Consensus. - def initialize(length) - @length = length - - @freq = NArray.int(@length, ALPH_DNA.size) - @qual = NArray.int(@length, ALPH_DNA.size) - @mask_A = NArray.byte(@length).fill!(BIT_A) - @mask_T = NArray.byte(@length).fill!(BIT_T) - @mask_C = NArray.byte(@length).fill!(BIT_C) - @mask_G = NArray.byte(@length).fill!(BIT_G) - @has_qual = false - @count = 0 - end - - # Method to add a Seq entry to a Consensus object. - def add(entry) - @count += 1 - - seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte") - - mask_A = (seq & @mask_A) > 0 - mask_T = (seq & @mask_T) > 0 - mask_C = (seq & @mask_C) > 0 - mask_G = (seq & @mask_G) > 0 + # Method to initialize a Consensus object given a list of aligned Seq object. + def initialize(entries) + @entries = entries + @cols = entries.first.length + @rows = entries.size + @na_seq = NArray.byte(@cols, @rows) + @na_qual = NArray.byte(@cols, @rows) - @freq[true, ROW_A] += mask_A - @freq[true, ROW_T] += mask_T - @freq[true, ROW_C] += mask_C - @freq[true, ROW_G] += mask_G + na_add_entries - unless entry.qual.nil? - @has_qual = true + m1 = mask_high_qual + m2 = mask_conserved_columns + m3 = mask_conserved_residues - qual = NArray.to_na(entry.qual, "byte") - QUAL_BASE + @na_seq *= m3 + @na_qual *= m3 - @qual[true, ROW_A] += mask_A * qual - @qual[true, ROW_T] += mask_T * qual - @qual[true, ROW_C] += mask_C * qual - @qual[true, ROW_G] += mask_G * qual - end + @na_seq *= (m1 | m2) + @na_qual *= (m1 | m2) end def to_seq new_seq = Seq.new new_seq.seq = consensus_seq - new_seq.qual = consensus_qual if @has_qual + new_seq.qual = consensus_qual new_seq.type = "dna" new_seq @@ -229,30 +204,50 @@ class Align private - # Method to calculate a consensus sequence from a Consensus object. - def consensus_seq - fp = (@freq.to_type("float") / @count) > CONSENSUS + def na_add_entries + @entries.each_with_index do |entry, i| + @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte") + @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - SCORE_ILLUMINA + end + end - fp *= @freq >= FREQ_MIN + def mask_high_qual + @na_qual > QUAL_MIN + end - cons = NArray.byte(@length) - cons |= fp[true, ROW_A] *= BIT_A - cons |= fp[true, ROW_T] *= BIT_T - cons |= fp[true, ROW_C] *= BIT_C - cons |= fp[true, ROW_G] *= BIT_G -# cons *= @freq.sum(1).to_type("float") / @count > INDEL_RATIO + def mask_conserved_columns + @na_seq * (@na_seq - @na_seq[true, 0]).sum(1).eq(0) > 0 + end - cons.to_s.tr!(TR_HEX, TR_NUC).upcase + def mask_conserved_residues + factor = (1 / @rows.to_f) + mask_A = @na_seq & BIT_A > 0 + mask_T = @na_seq & BIT_T > 0 + mask_C = @na_seq & BIT_C > 0 + mask_G = @na_seq & BIT_G > 0 + + mask_A = (mask_A * mask_A.sum(1)).to_f * factor > CONSENSUS + mask_T = (mask_T * mask_T.sum(1)).to_f * factor > CONSENSUS + mask_C = (mask_C * mask_C.sum(1)).to_f * factor > CONSENSUS + mask_G = (mask_G * mask_G.sum(1)).to_f * factor > CONSENSUS + + mask_A | mask_T | mask_C | mask_G end - def consensus_qual - q = qual_mean.max(1) + # Method to calculate a consensus sequence from a Consensus object. + def consensus_seq + cons = NArray.byte(@cols) + cons |= (@na_seq & BIT_A).max(1) + cons |= (@na_seq & BIT_T).max(1) + cons |= (@na_seq & BIT_C).max(1) + cons |= (@na_seq & BIT_G).max(1) - (q.to_type("byte") + QUAL_BASE).to_s + cons.to_s.tr!(TR_HEX, TR_NUC).upcase end - def qual_mean - @qual / (@freq + @freq.eq(0)) + # Method to calculate a consensus quality from a Consensus object. + def consensus_qual + (@na_qual.mean(1).round.to_type("byte") + SCORE_ILLUMINA).to_s end end end @@ -260,3 +255,8 @@ end __END__ + +cons |= ((@na_seq & BIT_A > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_A +cons |= ((@na_seq & BIT_T > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_T +cons |= ((@na_seq & BIT_C > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_C +cons |= ((@na_seq & BIT_G > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_G -- 2.39.5