1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 require 'maasha/fasta'
30 class AlignError < StandardError; end;
36 ALPH_DNA = %w{A T C G}
37 ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
50 BIT_V = BIT_A | BIT_C | BIT_G
51 BIT_H = BIT_A | BIT_C | BIT_T
52 BIT_D = BIT_A | BIT_G | BIT_T
53 BIT_B = BIT_C | BIT_G | BIT_T
54 BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
74 TR_NUC = "-" + ALPH_AMBI.join("").downcase
75 TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
82 # Adding an initialize method to the existing Fasta class.
89 # Class for aligning sequences.
91 # Class method to align sequences in a list of Seq objects and
92 # return these as a new list of Seq objects.
93 def self.muscle(entries)
98 Open3.popen3("muscle") do |stdin, stdout, stderr|
99 entries.each do |entry|
100 index[entry.seq_name] = entry
102 stdin.puts entry.to_fasta
107 aligned_entries = Fasta.new(stdout)
109 aligned_entries.each do |fa_entry|
110 fq_entry = index[fa_entry.seq_name]
112 fa_entry.seq.scan(/-+/) do |m|
113 fq_entry.seq = fq_entry.seq[0 ... $`.length] + ('-' * m.length) + fq_entry.seq[$`.length .. -1]
114 fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1]
124 # Class method to create a consensus sequence from a list of Seq objects and
125 # return a new Seq object with the consensus.
126 def self.consensus(entries)
127 cons = Consensus.new(entries.first.length)
129 entries.each { |entry| cons.add(entry) }
135 # Method to initialize a Consensus object given a Seq object, which is added
137 def initialize(length)
140 @count = NArray.int(@length)
141 @freq = NArray.int(@length, ALPH_DNA.size)
142 @qual = NArray.int(@length, ALPH_DNA.size)
143 @mask_A = NArray.byte(@length).fill!(BIT_A)
144 @mask_T = NArray.byte(@length).fill!(BIT_T)
145 @mask_C = NArray.byte(@length).fill!(BIT_C)
146 @mask_G = NArray.byte(@length).fill!(BIT_G)
150 # Method to add a Seq entry to a Consensus object.
152 seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
156 mask_A = (seq & @mask_A) > 0
157 mask_T = (seq & @mask_T) > 0
158 mask_C = (seq & @mask_C) > 0
159 mask_G = (seq & @mask_G) > 0
161 @freq[true, ROW_A] += mask_A
162 @freq[true, ROW_T] += mask_T
163 @freq[true, ROW_C] += mask_C
164 @freq[true, ROW_G] += mask_G
166 unless entry.qual.nil?
169 qual = NArray.to_na(entry.qual, "byte") - QUAL_BASE
171 @qual[true, ROW_A] += mask_A * qual
172 @qual[true, ROW_T] += mask_T * qual
173 @qual[true, ROW_C] += mask_C * qual
174 @qual[true, ROW_G] += mask_G * qual
181 new_seq.seq = consensus_seq
182 new_seq.qual = consensus_qual if @has_qual
190 # Method that calculates the sum for each column except the indel row and
191 # returns the sum in an NArray.
196 # Method that locates the maximum value for each column and
197 # returns this in an NArray.
203 freq = @freq[true, [0 ... ALPH_DNA.size]]
204 @qual / (freq + freq.eq(0))
208 (@freq / freq_total.to_type("float") * 100).round
211 # Method to calculate a consensus sequence from a Consensus object.
213 fp = freq_percent > CONSENSUS
214 cons = NArray.byte(@length)
215 cons |= fp[true, ROW_A] *= BIT_A
216 cons |= fp[true, ROW_T] *= BIT_T
217 cons |= fp[true, ROW_C] *= BIT_C
218 cons |= fp[true, ROW_G] *= BIT_G
220 cons *= @count > @count.max * INDEL_RATIO
222 cons.to_s.tr!(TR_HEX, TR_NUC).upcase
226 q = (@qual / (@count + @count.eq(0))).max(1)
228 (q.to_type("byte") + QUAL_BASE).to_s
232 filter = qual_mean > QUAL_MIN