require 'pp'
require 'open3'
require 'narray'
+require 'maasha/align/pair'
require 'maasha/fasta'
class AlignError < StandardError; end;
-CONSENSUS = 20
-INDEL_RATIO = 0.5
-QUAL_MIN = 15
-QUAL_BASE = 64
-ALPH_DNA = %w{A T C G}
-ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
+ALPH_DNA = %w{A T C G}
+ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
BIT_INDEL = 0
BIT_A = 1 << 0
BIT_T = 1 << 1
BIT_C = 1 << 2
BIT_G = 1 << 3
+
BIT_M = BIT_A | BIT_C
BIT_R = BIT_A | BIT_G
BIT_W = BIT_A | BIT_T
# Class for aligning sequences.
class Align
+ attr_accessor :options
+ attr_reader :entries
+
+ include PairAlign
+
# Class method to align sequences in a list of Seq objects and
# return these as a new list of Seq objects.
def self.muscle(entries, verbose = false)
Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr|
entries.each do |entry|
- raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index.has_key? entry.seq_name
+ raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index[entry.seq_name]
- index[entry.seq_name] = entry
+ index[entry.seq_name] = entry.dup
stdin.puts entry.to_fasta
end
end
end
- result
+ self.new(result)
+ end
+
+ # Class method to create a pairwise alignment of two given Seq objects. The
+ # alignment is created by casting a search space the size of the sequences
+ # and save the best scoring match between the sequences and recurse into
+ # the left and right search spaced around this match. When all search spaces
+ # are exhausted the saved matches are used to insert gaps in the sequences.
+ def self.pair(q_entry, s_entry)
+ AlignPair.align(q_entry, s_entry)
+
+ self.new([q_entry, s_entry])
end
- # Class method to create a consensus sequence from a list of Seq objects and
+ # Method to initialize an Align object with a list of aligned Seq objects.
+ def initialize(entries, options = {})
+ @entries = entries
+ @options = options
+ end
+
+ # Method that returns the length of the alignment.
+ def length
+ @entries.first.length
+ end
+
+ # Method that returns the number of members or sequences in the alignment.
+ def members
+ @entries.size
+ end
+
+ # Method that returns the identity of an alignment with two members.
+ def identity
+ if self.members != 2
+ raise AlignError "Bad number of members for similarity calculation: #{self.members}"
+ end
+
+ na1 = NArray.to_na(@entries[0].seq.upcase, "byte")
+ na2 = NArray.to_na(@entries[1].seq.upcase, "byte")
+
+ shared = (na1 - na2).count_false
+ total = (@entries[0].length < @entries[1].length) ? @entries[0].length : @entries[1].length
+ identity = shared.to_f / total
+
+ identity
+ end
+
+ # Method to create a consensus sequence from an Align object and
# return a new Seq object with the consensus.
- def self.consensus(entries)
- cons = Consensus.new(entries.first.length)
+ def consensus
+ cons = Consensus.new(@entries, @options)
+ cons.consensus
+ end
- entries.each { |entry| cons.add(entry) }
+ # Method to pretty print an alignment from an Align object.
+ def to_s
+ cons = Consensus.new(@entries, @options)
+ cons.mask_entries!
- cons.to_seq
+ max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
+
+ output = ""
+
+ @entries.each do |entry|
+ output << entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + $/
+ end
+
+ cons_entry = cons.consensus
+
+ output << " " * (max_name + 3) + cons_entry.seq
+ output << $/ + " " * (max_name + 3) + cons_entry.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons_entry.qual.nil?
+ output
+ end
+
+ # Method for iterating each of the aligned sequences.
+ def each
+ if block_given?
+ @entries.each { |entry| yield entry }
+ else
+ return @entries
+ end
end
+ private
+
class Consensus
- # Method to initialize a Consensus object given a Seq object, which is added
- # to the Consensus.
- def initialize(length)
- @length = length
-
- @count = NArray.int(@length)
- @freq = NArray.int(@length, ALPH_DNA.size)
- @qual = NArray.int(@length, ALPH_DNA.size)
- @mask_A = NArray.byte(@length).fill!(BIT_A)
- @mask_T = NArray.byte(@length).fill!(BIT_T)
- @mask_C = NArray.byte(@length).fill!(BIT_C)
- @mask_G = NArray.byte(@length).fill!(BIT_G)
- @has_qual = false
- end
-
- # Method to add a Seq entry to a Consensus object.
- def add(entry)
- seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
-
- @count += seq > 0
-
- mask_A = (seq & @mask_A) > 0
- mask_T = (seq & @mask_T) > 0
- mask_C = (seq & @mask_C) > 0
- mask_G = (seq & @mask_G) > 0
-
- @freq[true, ROW_A] += mask_A
- @freq[true, ROW_T] += mask_T
- @freq[true, ROW_C] += mask_C
- @freq[true, ROW_G] += mask_G
-
- unless entry.qual.nil?
- @has_qual = true
-
- qual = NArray.to_na(entry.qual, "byte") - QUAL_BASE
-
- @qual[true, ROW_A] += mask_A * qual
- @qual[true, ROW_T] += mask_T * qual
- @qual[true, ROW_C] += mask_C * qual
- @qual[true, ROW_G] += mask_G * qual
+ # Method to initialize a Consensus object given a list of aligned Seq object.
+ def initialize(entries, options)
+ @entries = entries
+ @options = options
+
+ @cols = entries.first.seq.length
+ @rows = entries.size
+
+ @has_qual = entries.first.qual.nil? ? false : true
+
+ @na_seq = NArray.byte(@cols, @rows)
+ @na_qual = NArray.byte(@cols, @rows) if @has_qual
+
+ na_add_entries
+ consensus_calc
+ end
+
+ # Method that lowercase residues that have been removed during
+ # the determination of the consensus sequence.
+ def mask_entries!
+ na_seq = NArray.byte(@cols, @rows)
+
+ @entries.each_with_index do |entry, i|
+ na_seq[true, i] = NArray.to_na(entry.seq.upcase, "byte")
+ end
+
+ na_seq += ((na_seq.ne('-'.ord) - @na_seq.ne(0)) * ' '.ord)
+
+ @entries.each_with_index do |entry, i|
+ entry.seq = na_seq[true, i].to_s
end
end
- def to_seq
- #qual_filter
+ # Method that returns a Sequence object with a consensus sequence
+ # for the entries in an Align object.
+ def consensus
new_seq = Seq.new
new_seq.seq = consensus_seq
new_seq.qual = consensus_qual if @has_qual
- new_seq.type = "dna"
+ new_seq.type = :dna
new_seq
end
private
- # Method that calculates the sum for each column except the indel row and
- # returns the sum in an NArray.
- def freq_total
- @freq.sum(1)
+ # Method to add a Seq entry object to the two NArrays; @na_seq and @na_qual
+ def na_add_entries
+ @entries.each_with_index do |entry, i|
+ @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
+ @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - Seq::SCORE_BASE if @has_qual
+ end
end
- # Method that locates the maximum value for each column and
- # returns this in an NArray.
- def freq_max
- @freq.max(1)
+ # Method to calculate a consensus sequence from a list of sequenced stored in two
+ # NArrays.
+ def consensus_calc
+ if @has_qual
+ if @options[:quality_min]
+ mask = mask_quality_min
+
+ @na_seq *= mask
+ @na_qual *= mask
+ end
+
+ if @options[:quality_mean]
+ mask = mask_quality_mean
+
+ @na_seq *= mask
+ @na_qual *= mask
+ end
+ end
+
+ if @options[:sequence_min]
+ mask = mask_sequence_min
+
+ @na_seq *= mask
+ @na_qual *= mask if @has_qual
+ end
+
+ if @options[:gap_max]
+ mask = mask_gap_max
+
+ @na_seq *= mask
+ @na_qual *= mask if @has_qual
+ end
+
+ if @options[:residue_min]
+ mask = mask_residue_min
+
+ @na_seq *= mask
+ @na_qual *= mask if @has_qual
+ end
end
- def qual_mean
- freq = @freq[true, [0 ... ALPH_DNA.size]]
- @qual / (freq + freq.eq(0))
+ # Mask that indicates which columns have more than sequence_min sequences.
+ # Columns with less than sequence_min are 0'ed, otherwise set to 1.
+ def mask_sequence_min
+ mask = NArray.byte(@cols, @rows) + 1
+ mask *= ((@na_seq > 0).to_type("int").sum(1) >= @options[:sequence_min])
+ mask
end
- def freq_percent
- (@freq / freq_total.to_type("float") * 100).round
+ # Mask that indicates which residue frequencies that are above the residue_min.
+ # The residue frequencies are calculated for each column and residue type as the
+ # number of each residue type over the sum of all non-gap residues in that column.
+ # Positions with residues above the residue_min are indicated with 1.
+ def mask_residue_min
+ cons_min = @options[:residue_min]
+ factor = 1 / @na_seq.ne(0).to_type("float").sum(1)
+
+ mask_A = (@na_seq & BIT_A > 0).to_type("int")
+ mask_T = (@na_seq & BIT_T > 0).to_type("int")
+ mask_C = (@na_seq & BIT_C > 0).to_type("int")
+ mask_G = (@na_seq & BIT_G > 0).to_type("int")
+
+ mask_A = (mask_A * mask_A.sum(1)) * factor >= cons_min
+ mask_T = (mask_T * mask_T.sum(1)) * factor >= cons_min
+ mask_C = (mask_C * mask_C.sum(1)) * factor >= cons_min
+ mask_G = (mask_G * mask_G.sum(1)) * factor >= cons_min
+
+ mask_A | mask_T | mask_C | mask_G
end
- # Method to calculate a consensus sequence from a Consensus object.
- def consensus_seq
- fp = freq_percent > CONSENSUS
- cons = NArray.byte(@length)
- cons |= fp[true, ROW_A] *= BIT_A
- cons |= fp[true, ROW_T] *= BIT_T
- cons |= fp[true, ROW_C] *= BIT_C
- cons |= fp[true, ROW_G] *= BIT_G
+ # Mask that indicates which columns contain fewer gaps than max_gap.
+ # Columns with more gaps are 0'ed, others are set to 1.
+ def mask_gap_max
+ mask = NArray.byte(@cols, @rows) + 1
+ mask *= @na_seq.ne(0).to_type("float").sum(1) / @rows > @options[:gap_max]
- cons *= @count > @count.max * INDEL_RATIO
+ mask
+ end
- cons.to_s.tr!(TR_HEX, TR_NUC).upcase
+ # Mask that indicates which residues in an alignment are above quality_min.
+ # Positions with subquality are 0'ed - all others are set to 1.
+ def mask_quality_min
+ @na_qual > @options[:quality_min]
end
- def consensus_qual
- q = (@qual / (@count + @count.eq(0))).max(1)
+ # Mask that indicates which columns have a quality mean above quality_mean which
+ # is the mean of all non-gap quality residues in that column. Columns with less then
+ # quality_mean are 0'ed, otherwise set to 1.
+ def mask_quality_mean
+ mask = NArray.byte(@cols, @rows) + 1
+ residues = @na_seq.ne(0).to_type("int").sum(1)
+ quality = @na_qual.to_type("float").sum(1)
- (q.to_type("byte") + QUAL_BASE).to_s
+ mask * (quality / residues).round > @options[:quality_mean]
end
- def qual_filter
- filter = qual_mean > QUAL_MIN
-
- # pp @freq
- # pp @qual
+ # Method to calculate a consensus sequence from a Consensus object.
+ def consensus_seq
+ cons = NArray.byte(@cols)
+ cons |= (@na_seq & BIT_A).max(1)
+ cons |= (@na_seq & BIT_T).max(1)
+ cons |= (@na_seq & BIT_C).max(1)
+ cons |= (@na_seq & BIT_G).max(1)
- # @freq *= filter
- # @qual *= filter
+ cons.to_s.tr!(TR_HEX, TR_NUC).upcase
+ end
- # pp @freq
- # pp @qual
+ # Method to calculate a consensus quality from a Consensus object.
+ def consensus_qual
+ (@na_qual.mean(1).round.to_type("byte") + Seq::SCORE_BASE).to_s
end
end
end
-
-
__END__