From: martinahansen Date: Fri, 2 Mar 2012 11:12:30 +0000 (+0000) Subject: major upgrade of align.rb for denoising X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=a33d48f1e332f62e31f65d7523df74448fbaa696;p=biopieces.git major upgrade of align.rb for denoising git-svn-id: http://biopieces.googlecode.com/svn/trunk@1758 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb index b758cd4..3068bb9 100755 --- a/code_ruby/lib/maasha/align.rb +++ b/code_ruby/lib/maasha/align.rb @@ -29,9 +29,6 @@ require 'maasha/fasta' class AlignError < StandardError; end; -CONS_MIN = 0.20 -FREQ_MIN = 2 -QUAL_MIN = 20 ALPH_DNA = %w{A T C G} ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} @@ -150,26 +147,26 @@ class Align # return a new Seq object with the consensus. def consensus cons = Consensus.new(@entries, @options) - cons.to_seq + cons.consensus end # Method to pretty print an alignment from an Align object. def to_s - qual_min = @options[:quality_min] || QUAL_MIN + cons = Consensus.new(@entries, @options) + + entries = cons.each + cons_entry = cons.consensus - cons = self.consensus - cons.mask_seq_soft!(qual_min) unless cons.qual.nil? + max_name = entries.group_by { |entry| entry.seq_name.length }.max.first - max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first + output = "" - @entries.each do |entry| - entry.mask_seq_soft!(qual_min) unless entry.qual.nil? - puts entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + entries.each do |entry| + output << entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + $/ end - output = "" - output << " " * (max_name + 3) + cons.seq + $/ - output << " " * (max_name + 3) + cons.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons.qual.nil? + output << " " * (max_name + 3) + cons_entry.seq + $/ + output << " " * (max_name + 3) + cons_entry.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons_entry.qual.nil? output end @@ -181,34 +178,79 @@ class Align @entries = entries @options = options - @cons_min = options[:consensus_min] || CONS_MIN - @freq_min = options[:frequency_min] || FREQ_MIN - @qual_min = options[:quality_min] || QUAL_MIN - - @cols = entries.first.length + @cols = entries.first.seq.length @rows = entries.size @has_qual = entries.first.qual.nil? ? false : true - @na_seq = NArray.byte(@cols, @rows) - @na_qual = NArray.byte(@cols, @rows) if @has_qual + @na_seq = NArray.byte(entries.first.length, entries.size) + @na_qual = NArray.byte(entries.first.length, entries.size) if @has_qual na_add_entries if @has_qual - m1 = mask_high_qual - m2 = mask_conserved_columns - m3 = mask_conserved_residues + if @options.has_key? :quality_min + mask = mask_quality_min + + @na_seq *= mask + @na_qual *= mask + end + + if @options.has_key? :quality_mean + mask = mask_quality_mean + + @na_seq *= mask + @na_qual *= mask + end + end + + if @options.has_key? :sequence_min + mask = mask_sequence_min + + @na_seq *= mask + @na_qual *= mask if @has_qual + end + + if @options.has_key? :gap_max + mask = mask_gap_max + + @na_seq *= mask + @na_qual *= mask if @has_qual + end - @na_seq *= m3 - @na_qual *= m3 + if @options.has_key? :residue_min + mask = mask_residue_min - @na_seq *= (m1 | m2) - @na_qual *= (m1 | m2) + @na_seq *= mask + @na_qual *= mask if @has_qual end end - def to_seq + # Method for iterating over the rows of an Align object and + # return Seq objects for each sequence. + def each + entries = [] + + (0 ... @rows).each do |i| + entry = Seq.new + entry.seq_name = @entries[i].seq_name + entry.seq = @na_seq[true, i].to_s.tr!(TR_HEX, TR_NUC).upcase + entry.qual = (@na_qual[true, i].to_type("byte") + SCORE_ILLUMINA).to_s if @has_qual + entry.type = @entries[i].type + + if block_given? + yield entry + else + entries << entry + end + end + + return entries unless block_given? + end + + # Method that returns a Sequence object with a consensus sequence + # for the entries in an Align object. + def consensus new_seq = Seq.new new_seq.seq = consensus_seq new_seq.qual = consensus_qual if @has_qual @@ -226,29 +268,61 @@ class Align end end - def mask_high_qual - @na_qual > @qual_min + # Mask that indicates which columns have more than sequence_min sequences. + # Columns with less than sequence_min are 0'ed, otherwise set to 1. + def mask_sequence_min + mask = NArray.byte(@cols, @rows) + 1 + mask *= ((@na_seq > 0).sum(1) >= @options[:sequence_min]) + mask end - def mask_conserved_columns - @na_seq * (@na_seq - @na_seq[true, 0]).sum(1).eq(0) > 0 - end + # Mask that indicates which residue frequencies that are above the residue_min. + # The residue frequencies are calculated for each column and residue type as the + # number of each residue type over the sum of all non-gap residues in that column. + # Positions with residues above the residue_min are indicated with 1. + def mask_residue_min + cons_min = @options[:residue_min] + factor = 1 / @na_seq.ne(0).sum(1).to_type("float") - def mask_conserved_residues - factor = (1 / @rows.to_f) mask_A = @na_seq & BIT_A > 0 mask_T = @na_seq & BIT_T > 0 mask_C = @na_seq & BIT_C > 0 mask_G = @na_seq & BIT_G > 0 - mask_A = (mask_A * mask_A.sum(1)).to_f * factor > @cons_min - mask_T = (mask_T * mask_T.sum(1)).to_f * factor > @cons_min - mask_C = (mask_C * mask_C.sum(1)).to_f * factor > @cons_min - mask_G = (mask_G * mask_G.sum(1)).to_f * factor > @cons_min + mask_A = (mask_A * mask_A.sum(1)) * factor >= cons_min + mask_T = (mask_T * mask_T.sum(1)) * factor >= cons_min + mask_C = (mask_C * mask_C.sum(1)) * factor >= cons_min + mask_G = (mask_G * mask_G.sum(1)) * factor >= cons_min mask_A | mask_T | mask_C | mask_G end + # Mask that indicates which columns contain fewer gaps than max_gap. + # Columns with more gaps are 0'ed, others are set to 1. + def mask_gap_max + mask = NArray.byte(@cols, @rows) + 1 + mask *= @na_seq.ne(0).sum(1).to_type("float") / @rows > @options[:gap_max] + + mask + end + + # Mask that indicates which residues in an alignment are above quality_min. + # Positions with subquality are 0'ed - all others are set to 1. + def mask_quality_min + @na_qual > @options[:quality_min] + end + + # Mask that indicates which columns have a quality mean above quality_mean which + # is the mean of all non-gap quality residues in that column. Columns with less then + # quality_mean are 0'ed, otherwise set to 1. + def mask_quality_mean + mask = NArray.byte(@cols, @rows) + 1 + residues = @na_seq.ne(0).to_type("int").sum(1) + quality = @na_qual.to_type("float").sum(1) + + mask * (quality / residues).round > @options[:quality_mean] + end + # Method to calculate a consensus sequence from a Consensus object. def consensus_seq cons = NArray.byte(@cols) @@ -267,11 +341,4 @@ class Align end end - - __END__ - -cons |= ((@na_seq & BIT_A > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_A -cons |= ((@na_seq & BIT_T > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_T -cons |= ((@na_seq & BIT_C > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_C -cons |= ((@na_seq & BIT_G > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_G