1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 require 'maasha/fasta'
30 class AlignError < StandardError; end;
35 ALPH_DNA = %w{A T C G}
36 ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
50 BIT_V = BIT_A | BIT_C | BIT_G
51 BIT_H = BIT_A | BIT_C | BIT_T
52 BIT_D = BIT_A | BIT_G | BIT_T
53 BIT_B = BIT_C | BIT_G | BIT_T
54 BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
74 TR_NUC = "-" + ALPH_AMBI.join("").downcase
75 TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
92 # Class for aligning sequences.
94 attr_accessor :options
97 # Class method to align sequences in a list of Seq objects and
98 # return these as a new list of Seq objects.
99 def self.muscle(entries, verbose = false)
103 Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr|
104 entries.each do |entry|
105 raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index.has_key? entry.seq_name
107 index[entry.seq_name] = entry
109 stdin.puts entry.to_fasta
114 stderr.each_line { |line| $stderr.puts line } if verbose
116 aligned_entries = Fasta.new(stdout)
118 aligned_entries.each do |fa_entry|
119 fq_entry = index[fa_entry.seq_name]
121 fa_entry.seq.scan(/-+/) do |m|
122 fq_entry.seq = fq_entry.seq[0 ... $`.length] + ('-' * m.length) + fq_entry.seq[$`.length .. -1]
123 fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1] unless fq_entry.qual.nil?
133 # Method to initialize an Align object with a list of aligned Seq objects.
134 def initialize(entries, options = {})
139 # Method that returns the length of the alignment.
141 @entries.first.length
144 # Method that returns the number of members or sequences in the alignment.
149 # Method to create a consensus sequence from an Align object and
150 # return a new Seq object with the consensus.
152 cons = Consensus.new(@entries, @options)
156 # Method to pretty print an alignment from an Align object.
158 qual_min = @options[:quality_min] || QUAL_MIN
160 cons = self.consensus
161 cons.mask_seq_soft!(qual_min) unless cons.qual.nil?
163 max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
165 @entries.each do |entry|
166 entry.mask_seq_soft!(qual_min) unless entry.qual.nil?
167 puts entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq
171 output << " " * (max_name + 3) + cons.seq + $/
172 output << " " * (max_name + 3) + cons.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons.qual.nil?
179 # Method to initialize a Consensus object given a list of aligned Seq object.
180 def initialize(entries, options)
184 @cons_min = options[:consensus_min] || CONS_MIN
185 @freq_min = options[:frequency_min] || FREQ_MIN
186 @qual_min = options[:quality_min] || QUAL_MIN
188 @cols = entries.first.length
191 @has_qual = entries.first.qual.nil? ? false : true
193 @na_seq = NArray.byte(@cols, @rows)
194 @na_qual = NArray.byte(@cols, @rows) if @has_qual
200 m2 = mask_conserved_columns
201 m3 = mask_conserved_residues
207 @na_qual *= (m1 | m2)
213 new_seq.seq = consensus_seq
214 new_seq.qual = consensus_qual if @has_qual
223 @entries.each_with_index do |entry, i|
224 @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
225 @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - SCORE_ILLUMINA if @has_qual
233 def mask_conserved_columns
234 @na_seq * (@na_seq - @na_seq[true, 0]).sum(1).eq(0) > 0
237 def mask_conserved_residues
238 factor = (1 / @rows.to_f)
239 mask_A = @na_seq & BIT_A > 0
240 mask_T = @na_seq & BIT_T > 0
241 mask_C = @na_seq & BIT_C > 0
242 mask_G = @na_seq & BIT_G > 0
244 mask_A = (mask_A * mask_A.sum(1)).to_f * factor > @cons_min
245 mask_T = (mask_T * mask_T.sum(1)).to_f * factor > @cons_min
246 mask_C = (mask_C * mask_C.sum(1)).to_f * factor > @cons_min
247 mask_G = (mask_G * mask_G.sum(1)).to_f * factor > @cons_min
249 mask_A | mask_T | mask_C | mask_G
252 # Method to calculate a consensus sequence from a Consensus object.
254 cons = NArray.byte(@cols)
255 cons |= (@na_seq & BIT_A).max(1)
256 cons |= (@na_seq & BIT_T).max(1)
257 cons |= (@na_seq & BIT_C).max(1)
258 cons |= (@na_seq & BIT_G).max(1)
260 cons.to_s.tr!(TR_HEX, TR_NUC).upcase
263 # Method to calculate a consensus quality from a Consensus object.
265 (@na_qual.mean(1).round.to_type("byte") + SCORE_ILLUMINA).to_s
274 cons |= ((@na_seq & BIT_A > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_A
275 cons |= ((@na_seq & BIT_T > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_T
276 cons |= ((@na_seq & BIT_C > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_C
277 cons |= ((@na_seq & BIT_G > 0).sum(1).to_type("float") * (1 / @rows.to_f) > CONSENSUS) * BIT_G