class AlignError < StandardError; end;
CONSENSUS = 0.20
-INDEL_RATIO = 0.5
FREQ_MIN = 2
-QUAL_BASE = 64
+QUAL_MIN = 20
ALPH_DNA = %w{A T C G}
ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
# Method to create a consensus sequence from an Align object and
# return a new Seq object with the consensus.
def consensus
- cons = Consensus.new(self.length)
-
- @entries.each { |entry| cons.add(entry) }
-
+ cons = Consensus.new(@entries)
cons.to_seq
end
# Method to pretty print an alignment from an Align object.
def to_s
cons = self.consensus
- cons.mask_seq_soft!(20) unless cons.qual.nil?
+ cons.mask_seq_soft!(QUAL_MIN) unless cons.qual.nil?
max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
@entries.each do |entry|
+ entry.mask_seq_soft!(QUAL_MIN) unless entry.qual.nil?
puts entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq
end
private
class Consensus
- # Method to initialize a Consensus object given a Seq object, which is added
- # to the Consensus.
- def initialize(length)
- @length = length
-
- @freq = NArray.int(@length, ALPH_DNA.size)
- @qual = NArray.int(@length, ALPH_DNA.size)
- @mask_A = NArray.byte(@length).fill!(BIT_A)
- @mask_T = NArray.byte(@length).fill!(BIT_T)
- @mask_C = NArray.byte(@length).fill!(BIT_C)
- @mask_G = NArray.byte(@length).fill!(BIT_G)
- @has_qual = false
- @count = 0
- end
-
- # Method to add a Seq entry to a Consensus object.
- def add(entry)
- @count += 1
-
- seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
-
- mask_A = (seq & @mask_A) > 0
- mask_T = (seq & @mask_T) > 0
- mask_C = (seq & @mask_C) > 0
- mask_G = (seq & @mask_G) > 0
+ # Method to initialize a Consensus object given a list of aligned Seq object.
+ def initialize(entries)
+ @entries = entries
+ @cols = entries.first.length
+ @rows = entries.size
+ @na_seq = NArray.byte(@cols, @rows)
+ @na_qual = NArray.byte(@cols, @rows)
- @freq[true, ROW_A] += mask_A
- @freq[true, ROW_T] += mask_T
- @freq[true, ROW_C] += mask_C
- @freq[true, ROW_G] += mask_G
+ na_add_entries
- unless entry.qual.nil?
- @has_qual = true
+ m1 = mask_high_qual
+ m2 = mask_conserved_columns
+ m3 = mask_conserved_residues
- qual = NArray.to_na(entry.qual, "byte") - QUAL_BASE
+ @na_seq *= m3
+ @na_qual *= m3
- @qual[true, ROW_A] += mask_A * qual
- @qual[true, ROW_T] += mask_T * qual
- @qual[true, ROW_C] += mask_C * qual
- @qual[true, ROW_G] += mask_G * qual
- end
+ @na_seq *= (m1 | m2)
+ @na_qual *= (m1 | m2)
end
def to_seq
new_seq = Seq.new
new_seq.seq = consensus_seq
- new_seq.qual = consensus_qual if @has_qual
+ new_seq.qual = consensus_qual
new_seq.type = "dna"
new_seq
private
- # Method to calculate a consensus sequence from a Consensus object.
- def consensus_seq
- fp = (@freq.to_type("float") / @count) > CONSENSUS
+ def na_add_entries
+ @entries.each_with_index do |entry, i|
+ @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
+ @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - SCORE_ILLUMINA
+ end
+ end
- fp *= @freq >= FREQ_MIN
+ def mask_high_qual
+ @na_qual > QUAL_MIN
+ end
- cons = NArray.byte(@length)
- cons |= fp[true, ROW_A] *= BIT_A
- cons |= fp[true, ROW_T] *= BIT_T
- cons |= fp[true, ROW_C] *= BIT_C
- cons |= fp[true, ROW_G] *= BIT_G
-# cons *= @freq.sum(1).to_type("float") / @count > INDEL_RATIO
+ def mask_conserved_columns
+ @na_seq * (@na_seq - @na_seq[true, 0]).sum(1).eq(0) > 0
+ end
- cons.to_s.tr!(TR_HEX, TR_NUC).upcase
+ def mask_conserved_residues
+ factor = (1 / @rows.to_f)
+ mask_A = @na_seq & BIT_A > 0
+ mask_T = @na_seq & BIT_T > 0
+ mask_C = @na_seq & BIT_C > 0
+ mask_G = @na_seq & BIT_G > 0
+
+ mask_A = (mask_A * mask_A.sum(1)).to_f * factor > CONSENSUS
+ mask_T = (mask_T * mask_T.sum(1)).to_f * factor > CONSENSUS
+ mask_C = (mask_C * mask_C.sum(1)).to_f * factor > CONSENSUS
+ mask_G = (mask_G * mask_G.sum(1)).to_f * factor > CONSENSUS
+
+ mask_A | mask_T | mask_C | mask_G
end
- def consensus_qual
- q = qual_mean.max(1)
+ # Method to calculate a consensus sequence from a Consensus object.
+ def consensus_seq
+ cons = NArray.byte(@cols)
+ cons |= (@na_seq & BIT_A).max(1)
+ cons |= (@na_seq & BIT_T).max(1)
+ cons |= (@na_seq & BIT_C).max(1)
+ cons |= (@na_seq & BIT_G).max(1)
- (q.to_type("byte") + QUAL_BASE).to_s
+ cons.to_s.tr!(TR_HEX, TR_NUC).upcase
end
- def qual_mean
- @qual / (@freq + @freq.eq(0))
+ # Method to calculate a consensus quality from a Consensus object.
+ def consensus_qual
+ (@na_qual.mean(1).round.to_type("byte") + SCORE_ILLUMINA).to_s
end
end
end
__END__
+
+cons |= ((@na_seq & BIT_A > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_A
+cons |= ((@na_seq & BIT_T > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_T
+cons |= ((@na_seq & BIT_C > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_C
+cons |= ((@na_seq & BIT_G > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_G