1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 require 'maasha/fasta'
30 class AlignError < StandardError; end;
35 ALPH_DNA = %w{A T C G}
36 ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
50 BIT_V = BIT_A | BIT_C | BIT_G
51 BIT_H = BIT_A | BIT_C | BIT_T
52 BIT_D = BIT_A | BIT_G | BIT_T
53 BIT_B = BIT_C | BIT_G | BIT_T
54 BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
74 TR_NUC = "-" + ALPH_AMBI.join("").downcase
75 TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
92 # Class for aligning sequences.
96 # Class method to align sequences in a list of Seq objects and
97 # return these as a new list of Seq objects.
98 def self.muscle(entries, verbose = false)
102 Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr|
103 entries.each do |entry|
104 raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index.has_key? entry.seq_name
106 index[entry.seq_name] = entry
108 stdin.puts entry.to_fasta
113 stderr.each_line { |line| $stderr.puts line } if verbose
115 aligned_entries = Fasta.new(stdout)
117 aligned_entries.each do |fa_entry|
118 fq_entry = index[fa_entry.seq_name]
120 fa_entry.seq.scan(/-+/) do |m|
121 fq_entry.seq = fq_entry.seq[0 ... $`.length] + ('-' * m.length) + fq_entry.seq[$`.length .. -1]
122 fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1] unless fq_entry.qual.nil?
132 # Method to initialize an Align object with a list of aligned Seq objects.
133 def initialize(entries)
137 # Method that returns the length of the alignment.
139 @entries.first.length
142 # Method that returns the number of members or sequences in the alignment.
147 # Method to create a consensus sequence from an Align object and
148 # return a new Seq object with the consensus.
150 cons = Consensus.new(@entries)
154 # Method to pretty print an alignment from an Align object.
156 cons = self.consensus
157 cons.mask_seq_soft!(QUAL_MIN) unless cons.qual.nil?
159 max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
161 @entries.each do |entry|
162 entry.mask_seq_soft!(QUAL_MIN) unless entry.qual.nil?
163 puts entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq
167 output << " " * (max_name + 3) + cons.seq + $/
168 output << " " * (max_name + 3) + cons.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons.qual.nil?
175 # Method to initialize a Consensus object given a list of aligned Seq object.
176 def initialize(entries)
178 @cols = entries.first.length
181 @has_qual = entries.first.qual.nil? ? false : true
183 @na_seq = NArray.byte(@cols, @rows)
184 @na_qual = NArray.byte(@cols, @rows) if @has_qual
190 m2 = mask_conserved_columns
191 m3 = mask_conserved_residues
197 @na_qual *= (m1 | m2)
203 new_seq.seq = consensus_seq
204 new_seq.qual = consensus_qual if @has_qual
213 @entries.each_with_index do |entry, i|
214 @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
215 @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - SCORE_ILLUMINA if @has_qual
223 def mask_conserved_columns
224 @na_seq * (@na_seq - @na_seq[true, 0]).sum(1).eq(0) > 0
227 def mask_conserved_residues
228 factor = (1 / @rows.to_f)
229 mask_A = @na_seq & BIT_A > 0
230 mask_T = @na_seq & BIT_T > 0
231 mask_C = @na_seq & BIT_C > 0
232 mask_G = @na_seq & BIT_G > 0
234 mask_A = (mask_A * mask_A.sum(1)).to_f * factor > CONSENSUS
235 mask_T = (mask_T * mask_T.sum(1)).to_f * factor > CONSENSUS
236 mask_C = (mask_C * mask_C.sum(1)).to_f * factor > CONSENSUS
237 mask_G = (mask_G * mask_G.sum(1)).to_f * factor > CONSENSUS
239 mask_A | mask_T | mask_C | mask_G
242 # Method to calculate a consensus sequence from a Consensus object.
244 cons = NArray.byte(@cols)
245 cons |= (@na_seq & BIT_A).max(1)
246 cons |= (@na_seq & BIT_T).max(1)
247 cons |= (@na_seq & BIT_C).max(1)
248 cons |= (@na_seq & BIT_G).max(1)
250 cons.to_s.tr!(TR_HEX, TR_NUC).upcase
253 # Method to calculate a consensus quality from a Consensus object.
255 (@na_qual.mean(1).round.to_type("byte") + SCORE_ILLUMINA).to_s
264 cons |= ((@na_seq & BIT_A > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_A
265 cons |= ((@na_seq & BIT_T > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_T
266 cons |= ((@na_seq & BIT_C > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_C
267 cons |= ((@na_seq & BIT_G > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_G