X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Falign.rb;h=c139fe8d89c30a773df458f23e730d2a73e992c4;hb=c4b49c5ce1ed3b46ae37e6ebd73c21d67d6d4810;hp=12cb5e011d5453c0131ae68c9e114b93c86fed77;hpb=f6014127c8bd48a3c3ff82bf874d23b874a57c8b;p=biopieces.git diff --git a/code_ruby/lib/maasha/align.rb b/code_ruby/lib/maasha/align.rb index 12cb5e0..c139fe8 100755 --- a/code_ruby/lib/maasha/align.rb +++ b/code_ruby/lib/maasha/align.rb @@ -25,22 +25,20 @@ require 'pp' require 'open3' require 'narray' +require 'maasha/align/pair' require 'maasha/fasta' class AlignError < StandardError; end; -CONSENSUS = 20 -INDEL_RATIO = 0.5 -QUAL_MIN = 15 -QUAL_BASE = 64 -ALPH_DNA = %w{A T C G} -ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} +ALPH_DNA = %w{A T C G} +ALPH_AMBI = %w{A T C G M R W S Y K V H D B N} BIT_INDEL = 0 BIT_A = 1 << 0 BIT_T = 1 << 1 BIT_C = 1 << 2 BIT_G = 1 << 3 + BIT_M = BIT_A | BIT_C BIT_R = BIT_A | BIT_G BIT_W = BIT_A | BIT_T @@ -79,31 +77,42 @@ ROW_T = 1 ROW_C = 2 ROW_G = 3 -# Adding an initialize method to the existing Fasta class. class Fasta def initialize(io) @io = io end + + def close + @io.close + end end # Class for aligning sequences. class Align + attr_accessor :options + attr_reader :entries + + include PairAlign + # Class method to align sequences in a list of Seq objects and # return these as a new list of Seq objects. - def self.muscle(entries) - has_qual = false + def self.muscle(entries, verbose = false) result = [] index = {} - Open3.popen3("muscle") do |stdin, stdout, stderr| + Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr| entries.each do |entry| - index[entry.seq_name] = entry + raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index[entry.seq_name] + + index[entry.seq_name] = entry.dup stdin.puts entry.to_fasta end stdin.close + stderr.each_line { |line| $stderr.puts line } if verbose + aligned_entries = Fasta.new(stdout) aligned_entries.each do |fa_entry| @@ -111,72 +120,134 @@ class Align fa_entry.seq.scan(/-+/) do |m| fq_entry.seq = fq_entry.seq[0 ... $`.length] + ('-' * m.length) + fq_entry.seq[$`.length .. -1] - fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1] + fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1] unless fq_entry.qual.nil? end result << fq_entry end end - result + self.new(result) + end + + # Class method to create a pairwise alignment of two given Seq objects. The + # alignment is created by casting a search space the size of the sequences + # and save the best scoring match between the sequences and recurse into + # the left and right search spaced around this match. When all search spaces + # are exhausted the saved matches are used to insert gaps in the sequences. + def self.pair(q_entry, s_entry) + AlignPair.align(q_entry, s_entry) + + self.new([q_entry, s_entry]) + end + + # Method to initialize an Align object with a list of aligned Seq objects. + def initialize(entries, options = {}) + @entries = entries + @options = options + end + + # Method that returns the length of the alignment. + def length + @entries.first.length + end + + # Method that returns the number of members or sequences in the alignment. + def members + @entries.size end - # Class method to create a consensus sequence from a list of Seq objects and + # Method that returns the identity of an alignment with two members. + def identity + if self.members != 2 + raise AlignError "Bad number of members for similarity calculation: #{self.members}" + end + + na1 = NArray.to_na(@entries[0].seq.upcase, "byte") + na2 = NArray.to_na(@entries[1].seq.upcase, "byte") + + shared = (na1 - na2).count_false + total = (@entries[0].length < @entries[1].length) ? @entries[0].length : @entries[1].length + identity = shared.to_f / total + + identity + end + + # Method to create a consensus sequence from an Align object and # return a new Seq object with the consensus. - def self.consensus(entries) - cons = Consensus.new(entries.first.length) + def consensus + cons = Consensus.new(@entries, @options) + cons.consensus + end + + # Method to pretty print an alignment from an Align object. + def to_s + cons = Consensus.new(@entries, @options) + cons.mask_entries! + + max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first - entries.each { |entry| cons.add(entry) } + output = "" - cons.to_seq + @entries.each do |entry| + output << entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + $/ + end + + cons_entry = cons.consensus + + output << " " * (max_name + 3) + cons_entry.seq + output << $/ + " " * (max_name + 3) + cons_entry.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons_entry.qual.nil? + output end + # Method for iterating each of the aligned sequences. + def each + if block_given? + @entries.each { |entry| yield entry } + else + return @entries + end + end + + private + class Consensus - # Method to initialize a Consensus object given a Seq object, which is added - # to the Consensus. - def initialize(length) - @length = length - - @count = NArray.int(@length) - @freq = NArray.int(@length, ALPH_DNA.size) - @qual = NArray.int(@length, ALPH_DNA.size) - @mask_A = NArray.byte(@length).fill!(BIT_A) - @mask_T = NArray.byte(@length).fill!(BIT_T) - @mask_C = NArray.byte(@length).fill!(BIT_C) - @mask_G = NArray.byte(@length).fill!(BIT_G) - @has_qual = false - end - - # Method to add a Seq entry to a Consensus object. - def add(entry) - seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte") - - @count += seq > 0 - - mask_A = (seq & @mask_A) > 0 - mask_T = (seq & @mask_T) > 0 - mask_C = (seq & @mask_C) > 0 - mask_G = (seq & @mask_G) > 0 - - @freq[true, ROW_A] += mask_A - @freq[true, ROW_T] += mask_T - @freq[true, ROW_C] += mask_C - @freq[true, ROW_G] += mask_G - - unless entry.qual.nil? - @has_qual = true - - qual = NArray.to_na(entry.qual, "byte") - QUAL_BASE - - @qual[true, ROW_A] += mask_A * qual - @qual[true, ROW_T] += mask_T * qual - @qual[true, ROW_C] += mask_C * qual - @qual[true, ROW_G] += mask_G * qual + # Method to initialize a Consensus object given a list of aligned Seq object. + def initialize(entries, options) + @entries = entries + @options = options + + @cols = entries.first.seq.length + @rows = entries.size + + @has_qual = entries.first.qual.nil? ? false : true + + @na_seq = NArray.byte(@cols, @rows) + @na_qual = NArray.byte(@cols, @rows) if @has_qual + + na_add_entries + consensus_calc + end + + # Method that lowercase residues that have been removed during + # the determination of the consensus sequence. + def mask_entries! + na_seq = NArray.byte(@cols, @rows) + + @entries.each_with_index do |entry, i| + na_seq[true, i] = NArray.to_na(entry.seq.upcase, "byte") + end + + na_seq += ((na_seq.ne('-'.ord) - @na_seq.ne(0)) * ' '.ord) + + @entries.each_with_index do |entry, i| + entry.seq = na_seq[true, i].to_s end end - def to_seq - #qual_filter + # Method that returns a Sequence object with a consensus sequence + # for the entries in an Align object. + def consensus new_seq = Seq.new new_seq.seq = consensus_seq new_seq.qual = consensus_qual if @has_qual @@ -187,62 +258,126 @@ class Align private - # Method that calculates the sum for each column except the indel row and - # returns the sum in an NArray. - def freq_total - @freq.sum(1) + # Method to add a Seq entry object to the two NArrays; @na_seq and @na_qual + def na_add_entries + @entries.each_with_index do |entry, i| + @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte") + @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - Seq::SCORE_BASE if @has_qual + end end - # Method that locates the maximum value for each column and - # returns this in an NArray. - def freq_max - @freq.max(1) + # Method to calculate a consensus sequence from a list of sequenced stored in two + # NArrays. + def consensus_calc + if @has_qual + if @options[:quality_min] + mask = mask_quality_min + + @na_seq *= mask + @na_qual *= mask + end + + if @options[:quality_mean] + mask = mask_quality_mean + + @na_seq *= mask + @na_qual *= mask + end + end + + if @options[:sequence_min] + mask = mask_sequence_min + + @na_seq *= mask + @na_qual *= mask if @has_qual + end + + if @options[:gap_max] + mask = mask_gap_max + + @na_seq *= mask + @na_qual *= mask if @has_qual + end + + if @options[:residue_min] + mask = mask_residue_min + + @na_seq *= mask + @na_qual *= mask if @has_qual + end end - def qual_mean - freq = @freq[true, [0 ... ALPH_DNA.size]] - @qual / (freq + freq.eq(0)) + # Mask that indicates which columns have more than sequence_min sequences. + # Columns with less than sequence_min are 0'ed, otherwise set to 1. + def mask_sequence_min + mask = NArray.byte(@cols, @rows) + 1 + mask *= ((@na_seq > 0).to_type("int").sum(1) >= @options[:sequence_min]) + mask end - def freq_percent - (@freq / freq_total.to_type("float") * 100).round + # Mask that indicates which residue frequencies that are above the residue_min. + # The residue frequencies are calculated for each column and residue type as the + # number of each residue type over the sum of all non-gap residues in that column. + # Positions with residues above the residue_min are indicated with 1. + def mask_residue_min + cons_min = @options[:residue_min] + factor = 1 / @na_seq.ne(0).to_type("float").sum(1) + + mask_A = (@na_seq & BIT_A > 0).to_type("int") + mask_T = (@na_seq & BIT_T > 0).to_type("int") + mask_C = (@na_seq & BIT_C > 0).to_type("int") + mask_G = (@na_seq & BIT_G > 0).to_type("int") + + mask_A = (mask_A * mask_A.sum(1)) * factor >= cons_min + mask_T = (mask_T * mask_T.sum(1)) * factor >= cons_min + mask_C = (mask_C * mask_C.sum(1)) * factor >= cons_min + mask_G = (mask_G * mask_G.sum(1)) * factor >= cons_min + + mask_A | mask_T | mask_C | mask_G end - # Method to calculate a consensus sequence from a Consensus object. - def consensus_seq - fp = freq_percent > CONSENSUS - cons = NArray.byte(@length) - cons |= fp[true, ROW_A] *= BIT_A - cons |= fp[true, ROW_T] *= BIT_T - cons |= fp[true, ROW_C] *= BIT_C - cons |= fp[true, ROW_G] *= BIT_G + # Mask that indicates which columns contain fewer gaps than max_gap. + # Columns with more gaps are 0'ed, others are set to 1. + def mask_gap_max + mask = NArray.byte(@cols, @rows) + 1 + mask *= @na_seq.ne(0).to_type("float").sum(1) / @rows > @options[:gap_max] - cons *= @count > @count.max * INDEL_RATIO + mask + end - cons.to_s.tr!(TR_HEX, TR_NUC).upcase + # Mask that indicates which residues in an alignment are above quality_min. + # Positions with subquality are 0'ed - all others are set to 1. + def mask_quality_min + @na_qual > @options[:quality_min] end - def consensus_qual - q = (@qual / (@count + @count.eq(0))).max(1) + # Mask that indicates which columns have a quality mean above quality_mean which + # is the mean of all non-gap quality residues in that column. Columns with less then + # quality_mean are 0'ed, otherwise set to 1. + def mask_quality_mean + mask = NArray.byte(@cols, @rows) + 1 + residues = @na_seq.ne(0).to_type("int").sum(1) + quality = @na_qual.to_type("float").sum(1) - (q.to_type("byte") + QUAL_BASE).to_s + mask * (quality / residues).round > @options[:quality_mean] end - def qual_filter - filter = qual_mean > QUAL_MIN - - # pp @freq - # pp @qual + # Method to calculate a consensus sequence from a Consensus object. + def consensus_seq + cons = NArray.byte(@cols) + cons |= (@na_seq & BIT_A).max(1) + cons |= (@na_seq & BIT_T).max(1) + cons |= (@na_seq & BIT_C).max(1) + cons |= (@na_seq & BIT_G).max(1) - # @freq *= filter - # @qual *= filter + cons.to_s.tr!(TR_HEX, TR_NUC).upcase + end - # pp @freq - # pp @qual + # Method to calculate a consensus quality from a Consensus object. + def consensus_qual + (@na_qual.mean(1).round.to_type("byte") + Seq::SCORE_BASE).to_s end end end - - __END__