1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 require 'maasha/align/pair'
29 require 'maasha/fasta'
31 class AlignError < StandardError; end;
33 ALPH_DNA = %w{A T C G}
34 ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
48 BIT_V = BIT_A | BIT_C | BIT_G
49 BIT_H = BIT_A | BIT_C | BIT_T
50 BIT_D = BIT_A | BIT_G | BIT_T
51 BIT_B = BIT_C | BIT_G | BIT_T
52 BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
72 TR_NUC = "-" + ALPH_AMBI.join("").downcase
73 TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
90 # Class for aligning sequences.
92 attr_accessor :options
97 # Class method to align sequences in a list of Seq objects and
98 # return these as a new list of Seq objects.
99 def self.muscle(entries, verbose = false)
103 Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr|
104 entries.each do |entry|
105 raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index[entry.seq_name]
107 index[entry.seq_name] = entry.dup
109 stdin.puts entry.to_fasta
114 stderr.each_line { |line| $stderr.puts line } if verbose
116 aligned_entries = Fasta.new(stdout)
118 aligned_entries.each do |fa_entry|
119 fq_entry = index[fa_entry.seq_name]
121 fa_entry.seq.scan(/-+/) do |m|
122 fq_entry.seq = fq_entry.seq[0 ... $`.length] + ('-' * m.length) + fq_entry.seq[$`.length .. -1]
123 fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1] unless fq_entry.qual.nil?
133 # Class method to create a pairwise alignment of two given Seq objects. The
134 # alignment is created by casting a search space the size of the sequences
135 # and save the best scoring match between the sequences and recurse into
136 # the left and right search spaced around this match. When all search spaces
137 # are exhausted the saved matches are used to insert gaps in the sequences.
138 def self.pair(q_entry, s_entry)
139 AlignPair.align(q_entry, s_entry)
141 self.new([q_entry, s_entry])
144 # Method to initialize an Align object with a list of aligned Seq objects.
145 def initialize(entries, options = {})
150 # Method that returns the length of the alignment.
152 @entries.first.length
155 # Method that returns the number of members or sequences in the alignment.
160 # Method that returns the identity of an alignment with two members.
163 raise AlignError "Bad number of members for similarity calculation: #{self.members}"
166 na1 = NArray.to_na(@entries[0].seq.upcase, "byte")
167 na2 = NArray.to_na(@entries[1].seq.upcase, "byte")
169 shared = (na1 - na2).count_false
170 total = (@entries[0].length < @entries[1].length) ? @entries[0].length : @entries[1].length
171 identity = shared.to_f / total
176 # Method to create a consensus sequence from an Align object and
177 # return a new Seq object with the consensus.
179 cons = Consensus.new(@entries, @options)
183 # Method to pretty print an alignment from an Align object.
185 cons = Consensus.new(@entries, @options)
188 max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
192 @entries.each do |entry|
193 output << entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + $/
196 cons_entry = cons.consensus
198 output << " " * (max_name + 3) + cons_entry.seq
199 output << $/ + " " * (max_name + 3) + cons_entry.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons_entry.qual.nil?
203 # Method for iterating each of the aligned sequences.
206 @entries.each { |entry| yield entry }
215 # Method to initialize a Consensus object given a list of aligned Seq object.
216 def initialize(entries, options)
220 @cols = entries.first.seq.length
223 @has_qual = entries.first.qual.nil? ? false : true
225 @na_seq = NArray.byte(@cols, @rows)
226 @na_qual = NArray.byte(@cols, @rows) if @has_qual
232 # Method that lowercase residues that have been removed during
233 # the determination of the consensus sequence.
235 na_seq = NArray.byte(@cols, @rows)
237 @entries.each_with_index do |entry, i|
238 na_seq[true, i] = NArray.to_na(entry.seq.upcase, "byte")
241 na_seq += ((na_seq.ne('-'.ord) - @na_seq.ne(0)) * ' '.ord)
243 @entries.each_with_index do |entry, i|
244 entry.seq = na_seq[true, i].to_s
248 # Method that returns a Sequence object with a consensus sequence
249 # for the entries in an Align object.
252 new_seq.seq = consensus_seq
253 new_seq.qual = consensus_qual if @has_qual
261 # Method to add a Seq entry object to the two NArrays; @na_seq and @na_qual
263 @entries.each_with_index do |entry, i|
264 @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
265 @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - Seq::SCORE_BASE if @has_qual
269 # Method to calculate a consensus sequence from a list of sequenced stored in two
273 if @options[:quality_min]
274 mask = mask_quality_min
280 if @options[:quality_mean]
281 mask = mask_quality_mean
288 if @options[:sequence_min]
289 mask = mask_sequence_min
292 @na_qual *= mask if @has_qual
295 if @options[:gap_max]
299 @na_qual *= mask if @has_qual
302 if @options[:residue_min]
303 mask = mask_residue_min
306 @na_qual *= mask if @has_qual
310 # Mask that indicates which columns have more than sequence_min sequences.
311 # Columns with less than sequence_min are 0'ed, otherwise set to 1.
312 def mask_sequence_min
313 mask = NArray.byte(@cols, @rows) + 1
314 mask *= ((@na_seq > 0).to_type("int").sum(1) >= @options[:sequence_min])
318 # Mask that indicates which residue frequencies that are above the residue_min.
319 # The residue frequencies are calculated for each column and residue type as the
320 # number of each residue type over the sum of all non-gap residues in that column.
321 # Positions with residues above the residue_min are indicated with 1.
323 cons_min = @options[:residue_min]
324 factor = 1 / @na_seq.ne(0).to_type("float").sum(1)
326 mask_A = (@na_seq & BIT_A > 0).to_type("int")
327 mask_T = (@na_seq & BIT_T > 0).to_type("int")
328 mask_C = (@na_seq & BIT_C > 0).to_type("int")
329 mask_G = (@na_seq & BIT_G > 0).to_type("int")
331 mask_A = (mask_A * mask_A.sum(1)) * factor >= cons_min
332 mask_T = (mask_T * mask_T.sum(1)) * factor >= cons_min
333 mask_C = (mask_C * mask_C.sum(1)) * factor >= cons_min
334 mask_G = (mask_G * mask_G.sum(1)) * factor >= cons_min
336 mask_A | mask_T | mask_C | mask_G
339 # Mask that indicates which columns contain fewer gaps than max_gap.
340 # Columns with more gaps are 0'ed, others are set to 1.
342 mask = NArray.byte(@cols, @rows) + 1
343 mask *= @na_seq.ne(0).to_type("float").sum(1) / @rows > @options[:gap_max]
348 # Mask that indicates which residues in an alignment are above quality_min.
349 # Positions with subquality are 0'ed - all others are set to 1.
351 @na_qual > @options[:quality_min]
354 # Mask that indicates which columns have a quality mean above quality_mean which
355 # is the mean of all non-gap quality residues in that column. Columns with less then
356 # quality_mean are 0'ed, otherwise set to 1.
357 def mask_quality_mean
358 mask = NArray.byte(@cols, @rows) + 1
359 residues = @na_seq.ne(0).to_type("int").sum(1)
360 quality = @na_qual.to_type("float").sum(1)
362 mask * (quality / residues).round > @options[:quality_mean]
365 # Method to calculate a consensus sequence from a Consensus object.
367 cons = NArray.byte(@cols)
368 cons |= (@na_seq & BIT_A).max(1)
369 cons |= (@na_seq & BIT_T).max(1)
370 cons |= (@na_seq & BIT_C).max(1)
371 cons |= (@na_seq & BIT_G).max(1)
373 cons.to_s.tr!(TR_HEX, TR_NUC).upcase
376 # Method to calculate a consensus quality from a Consensus object.
378 (@na_qual.mean(1).round.to_type("byte") + Seq::SCORE_BASE).to_s