1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 require 'maasha/fasta'
30 class AlignError < StandardError; end;
32 ALPH_DNA = %w{A T C G}
33 ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
47 BIT_V = BIT_A | BIT_C | BIT_G
48 BIT_H = BIT_A | BIT_C | BIT_T
49 BIT_D = BIT_A | BIT_G | BIT_T
50 BIT_B = BIT_C | BIT_G | BIT_T
51 BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
71 TR_NUC = "-" + ALPH_AMBI.join("").downcase
72 TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
89 # Class for aligning sequences.
91 attr_accessor :options
94 # Class method to align sequences in a list of Seq objects and
95 # return these as a new list of Seq objects.
96 def self.muscle(entries, verbose = false)
100 Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr|
101 entries.each do |entry|
102 raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index.has_key? entry.seq_name
104 index[entry.seq_name] = entry
106 stdin.puts entry.to_fasta
111 stderr.each_line { |line| $stderr.puts line } if verbose
113 aligned_entries = Fasta.new(stdout)
115 aligned_entries.each do |fa_entry|
116 fq_entry = index[fa_entry.seq_name]
118 fa_entry.seq.scan(/-+/) do |m|
119 fq_entry.seq = fq_entry.seq[0 ... $`.length] + ('-' * m.length) + fq_entry.seq[$`.length .. -1]
120 fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1] unless fq_entry.qual.nil?
130 # Method to initialize an Align object with a list of aligned Seq objects.
131 def initialize(entries, options = {})
136 # Method that returns the length of the alignment.
138 @entries.first.length
141 # Method that returns the number of members or sequences in the alignment.
146 # Method to create a consensus sequence from an Align object and
147 # return a new Seq object with the consensus.
149 cons = Consensus.new(@entries, @options)
153 # Method to pretty print an alignment from an Align object.
155 cons = Consensus.new(@entries, @options)
158 max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
162 @entries.each do |entry|
163 output << entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + $/
166 cons_entry = cons.consensus
168 output << " " * (max_name + 3) + cons_entry.seq
169 output << $/ + " " * (max_name + 3) + cons_entry.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons_entry.qual.nil?
176 # Method to initialize a Consensus object given a list of aligned Seq object.
177 def initialize(entries, options)
181 @cols = entries.first.seq.length
184 @has_qual = entries.first.qual.nil? ? false : true
186 @na_seq = NArray.byte(@cols, @rows)
187 @na_qual = NArray.byte(@cols, @rows) if @has_qual
193 # Method that lowercase residues that have been removed during
194 # the determination of the consensus sequence.
196 na_seq = NArray.byte(@cols, @rows)
198 @entries.each_with_index do |entry, i|
199 na_seq[true, i] = NArray.to_na(entry.seq.upcase, "byte")
202 na_seq += ((na_seq.ne('-'.ord) - @na_seq.ne(0)) * ' '.ord)
204 @entries.each_with_index do |entry, i|
205 entry.seq = na_seq[true, i].to_s
209 # Method that returns a Sequence object with a consensus sequence
210 # for the entries in an Align object.
213 new_seq.seq = consensus_seq
214 new_seq.qual = consensus_qual if @has_qual
222 # Method to add a Seq entry object to the two NArrays; @na_seq and @na_qual
224 @entries.each_with_index do |entry, i|
225 @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
226 @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - SCORE_ILLUMINA if @has_qual
230 # Method to calculate a consensus sequence from a list of sequenced stored in two
234 if @options.has_key? :quality_min
235 mask = mask_quality_min
241 if @options.has_key? :quality_mean
242 mask = mask_quality_mean
249 if @options.has_key? :sequence_min
250 mask = mask_sequence_min
253 @na_qual *= mask if @has_qual
256 if @options.has_key? :gap_max
260 @na_qual *= mask if @has_qual
263 if @options.has_key? :residue_min
264 mask = mask_residue_min
267 @na_qual *= mask if @has_qual
271 # Mask that indicates which columns have more than sequence_min sequences.
272 # Columns with less than sequence_min are 0'ed, otherwise set to 1.
273 def mask_sequence_min
274 mask = NArray.byte(@cols, @rows) + 1
275 mask *= ((@na_seq > 0).to_type("int").sum(1) >= @options[:sequence_min])
279 # Mask that indicates which residue frequencies that are above the residue_min.
280 # The residue frequencies are calculated for each column and residue type as the
281 # number of each residue type over the sum of all non-gap residues in that column.
282 # Positions with residues above the residue_min are indicated with 1.
284 cons_min = @options[:residue_min]
285 factor = 1 / @na_seq.ne(0).to_type("float").sum(1)
287 mask_A = (@na_seq & BIT_A > 0).to_type("int")
288 mask_T = (@na_seq & BIT_T > 0).to_type("int")
289 mask_C = (@na_seq & BIT_C > 0).to_type("int")
290 mask_G = (@na_seq & BIT_G > 0).to_type("int")
292 mask_A = (mask_A * mask_A.sum(1)) * factor >= cons_min
293 mask_T = (mask_T * mask_T.sum(1)) * factor >= cons_min
294 mask_C = (mask_C * mask_C.sum(1)) * factor >= cons_min
295 mask_G = (mask_G * mask_G.sum(1)) * factor >= cons_min
297 mask_A | mask_T | mask_C | mask_G
300 # Mask that indicates which columns contain fewer gaps than max_gap.
301 # Columns with more gaps are 0'ed, others are set to 1.
303 mask = NArray.byte(@cols, @rows) + 1
304 mask *= @na_seq.ne(0).to_type("float").sum(1) / @rows > @options[:gap_max]
309 # Mask that indicates which residues in an alignment are above quality_min.
310 # Positions with subquality are 0'ed - all others are set to 1.
312 @na_qual > @options[:quality_min]
315 # Mask that indicates which columns have a quality mean above quality_mean which
316 # is the mean of all non-gap quality residues in that column. Columns with less then
317 # quality_mean are 0'ed, otherwise set to 1.
318 def mask_quality_mean
319 mask = NArray.byte(@cols, @rows) + 1
320 residues = @na_seq.ne(0).to_type("int").sum(1)
321 quality = @na_qual.to_type("float").sum(1)
323 mask * (quality / residues).round > @options[:quality_mean]
326 # Method to calculate a consensus sequence from a Consensus object.
328 cons = NArray.byte(@cols)
329 cons |= (@na_seq & BIT_A).max(1)
330 cons |= (@na_seq & BIT_T).max(1)
331 cons |= (@na_seq & BIT_C).max(1)
332 cons |= (@na_seq & BIT_G).max(1)
334 cons.to_s.tr!(TR_HEX, TR_NUC).upcase
337 # Method to calculate a consensus quality from a Consensus object.
339 (@na_qual.mean(1).round.to_type("byte") + SCORE_ILLUMINA).to_s