class AlignError < StandardError; end;
-CONS_MIN = 0.20
-FREQ_MIN = 2
-QUAL_MIN = 20
ALPH_DNA = %w{A T C G}
ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
# return a new Seq object with the consensus.
def consensus
cons = Consensus.new(@entries, @options)
- cons.to_seq
+ cons.consensus
end
# Method to pretty print an alignment from an Align object.
def to_s
- qual_min = @options[:quality_min] || QUAL_MIN
+ cons = Consensus.new(@entries, @options)
+
+ entries = cons.each
+ cons_entry = cons.consensus
- cons = self.consensus
- cons.mask_seq_soft!(qual_min) unless cons.qual.nil?
+ max_name = entries.group_by { |entry| entry.seq_name.length }.max.first
- max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
+ output = ""
- @entries.each do |entry|
- entry.mask_seq_soft!(qual_min) unless entry.qual.nil?
- puts entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq
+ entries.each do |entry|
+ output << entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + $/
end
- output = ""
- output << " " * (max_name + 3) + cons.seq + $/
- output << " " * (max_name + 3) + cons.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons.qual.nil?
+ output << " " * (max_name + 3) + cons_entry.seq + $/
+ output << " " * (max_name + 3) + cons_entry.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons_entry.qual.nil?
output
end
@entries = entries
@options = options
- @cons_min = options[:consensus_min] || CONS_MIN
- @freq_min = options[:frequency_min] || FREQ_MIN
- @qual_min = options[:quality_min] || QUAL_MIN
-
- @cols = entries.first.length
+ @cols = entries.first.seq.length
@rows = entries.size
@has_qual = entries.first.qual.nil? ? false : true
- @na_seq = NArray.byte(@cols, @rows)
- @na_qual = NArray.byte(@cols, @rows) if @has_qual
+ @na_seq = NArray.byte(entries.first.length, entries.size)
+ @na_qual = NArray.byte(entries.first.length, entries.size) if @has_qual
na_add_entries
if @has_qual
- m1 = mask_high_qual
- m2 = mask_conserved_columns
- m3 = mask_conserved_residues
+ if @options.has_key? :quality_min
+ mask = mask_quality_min
+
+ @na_seq *= mask
+ @na_qual *= mask
+ end
+
+ if @options.has_key? :quality_mean
+ mask = mask_quality_mean
+
+ @na_seq *= mask
+ @na_qual *= mask
+ end
+ end
+
+ if @options.has_key? :sequence_min
+ mask = mask_sequence_min
+
+ @na_seq *= mask
+ @na_qual *= mask if @has_qual
+ end
+
+ if @options.has_key? :gap_max
+ mask = mask_gap_max
+
+ @na_seq *= mask
+ @na_qual *= mask if @has_qual
+ end
- @na_seq *= m3
- @na_qual *= m3
+ if @options.has_key? :residue_min
+ mask = mask_residue_min
- @na_seq *= (m1 | m2)
- @na_qual *= (m1 | m2)
+ @na_seq *= mask
+ @na_qual *= mask if @has_qual
end
end
- def to_seq
+ # Method for iterating over the rows of an Align object and
+ # return Seq objects for each sequence.
+ def each
+ entries = []
+
+ (0 ... @rows).each do |i|
+ entry = Seq.new
+ entry.seq_name = @entries[i].seq_name
+ entry.seq = @na_seq[true, i].to_s.tr!(TR_HEX, TR_NUC).upcase
+ entry.qual = (@na_qual[true, i].to_type("byte") + SCORE_ILLUMINA).to_s if @has_qual
+ entry.type = @entries[i].type
+
+ if block_given?
+ yield entry
+ else
+ entries << entry
+ end
+ end
+
+ return entries unless block_given?
+ end
+
+ # Method that returns a Sequence object with a consensus sequence
+ # for the entries in an Align object.
+ def consensus
new_seq = Seq.new
new_seq.seq = consensus_seq
new_seq.qual = consensus_qual if @has_qual
end
end
- def mask_high_qual
- @na_qual > @qual_min
+ # Mask that indicates which columns have more than sequence_min sequences.
+ # Columns with less than sequence_min are 0'ed, otherwise set to 1.
+ def mask_sequence_min
+ mask = NArray.byte(@cols, @rows) + 1
+ mask *= ((@na_seq > 0).sum(1) >= @options[:sequence_min])
+ mask
end
- def mask_conserved_columns
- @na_seq * (@na_seq - @na_seq[true, 0]).sum(1).eq(0) > 0
- end
+ # Mask that indicates which residue frequencies that are above the residue_min.
+ # The residue frequencies are calculated for each column and residue type as the
+ # number of each residue type over the sum of all non-gap residues in that column.
+ # Positions with residues above the residue_min are indicated with 1.
+ def mask_residue_min
+ cons_min = @options[:residue_min]
+ factor = 1 / @na_seq.ne(0).sum(1).to_type("float")
- def mask_conserved_residues
- factor = (1 / @rows.to_f)
mask_A = @na_seq & BIT_A > 0
mask_T = @na_seq & BIT_T > 0
mask_C = @na_seq & BIT_C > 0
mask_G = @na_seq & BIT_G > 0
- mask_A = (mask_A * mask_A.sum(1)).to_f * factor > @cons_min
- mask_T = (mask_T * mask_T.sum(1)).to_f * factor > @cons_min
- mask_C = (mask_C * mask_C.sum(1)).to_f * factor > @cons_min
- mask_G = (mask_G * mask_G.sum(1)).to_f * factor > @cons_min
+ mask_A = (mask_A * mask_A.sum(1)) * factor >= cons_min
+ mask_T = (mask_T * mask_T.sum(1)) * factor >= cons_min
+ mask_C = (mask_C * mask_C.sum(1)) * factor >= cons_min
+ mask_G = (mask_G * mask_G.sum(1)) * factor >= cons_min
mask_A | mask_T | mask_C | mask_G
end
+ # Mask that indicates which columns contain fewer gaps than max_gap.
+ # Columns with more gaps are 0'ed, others are set to 1.
+ def mask_gap_max
+ mask = NArray.byte(@cols, @rows) + 1
+ mask *= @na_seq.ne(0).sum(1).to_type("float") / @rows > @options[:gap_max]
+
+ mask
+ end
+
+ # Mask that indicates which residues in an alignment are above quality_min.
+ # Positions with subquality are 0'ed - all others are set to 1.
+ def mask_quality_min
+ @na_qual > @options[:quality_min]
+ end
+
+ # Mask that indicates which columns have a quality mean above quality_mean which
+ # is the mean of all non-gap quality residues in that column. Columns with less then
+ # quality_mean are 0'ed, otherwise set to 1.
+ def mask_quality_mean
+ mask = NArray.byte(@cols, @rows) + 1
+ residues = @na_seq.ne(0).to_type("int").sum(1)
+ quality = @na_qual.to_type("float").sum(1)
+
+ mask * (quality / residues).round > @options[:quality_mean]
+ end
+
# Method to calculate a consensus sequence from a Consensus object.
def consensus_seq
cons = NArray.byte(@cols)
end
end
-
-
__END__
-
-cons |= ((@na_seq & BIT_A > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_A
-cons |= ((@na_seq & BIT_T > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_T
-cons |= ((@na_seq & BIT_C > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_C
-cons |= ((@na_seq & BIT_G > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_G