1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
28 require 'maasha/fasta'
30 class AlignError < StandardError; end;
32 ALPH_DNA = %w{A T C G}
33 ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
47 BIT_V = BIT_A | BIT_C | BIT_G
48 BIT_H = BIT_A | BIT_C | BIT_T
49 BIT_D = BIT_A | BIT_G | BIT_T
50 BIT_B = BIT_C | BIT_G | BIT_T
51 BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
71 TR_NUC = "-" + ALPH_AMBI.join("").downcase
72 TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
89 # Class for aligning sequences.
91 attr_accessor :options
94 # Class method to align sequences in a list of Seq objects and
95 # return these as a new list of Seq objects.
96 def self.muscle(entries, verbose = false)
100 Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr|
101 entries.each do |entry|
102 raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index.has_key? entry.seq_name
104 index[entry.seq_name] = entry.dup
106 stdin.puts entry.to_fasta
111 stderr.each_line { |line| $stderr.puts line } if verbose
113 aligned_entries = Fasta.new(stdout)
115 aligned_entries.each do |fa_entry|
116 fq_entry = index[fa_entry.seq_name]
118 fa_entry.seq.scan(/-+/) do |m|
119 fq_entry.seq = fq_entry.seq[0 ... $`.length] + ('-' * m.length) + fq_entry.seq[$`.length .. -1]
120 fq_entry.qual = fq_entry.qual[0 ... $`.length] + ('@' * m.length) + fq_entry.qual[$`.length .. -1] unless fq_entry.qual.nil?
130 # Class method to create a pairwise alignment of two given Seq objects. The
131 # alignment is created by casting a search space the size of the sequences
132 # and save the longest perfect match between the sequences and recurse into
133 # the left and right search spaced around this match. When all search spaces
134 # are exhausted the saved matches are used to insert gaps in the sequences.
135 def self.pair(q_entry, s_entry)
137 q_space_end = q_entry.length
139 s_space_end = s_entry.length
141 q_entry.seq.downcase!
142 s_entry.seq.downcase!
144 ap = AlignPair.new(q_entry.seq, s_entry.seq)
146 ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 256)
153 self.new([q_entry, s_entry])
156 # Method to initialize an Align object with a list of aligned Seq objects.
157 def initialize(entries, options = {})
162 # Method that returns the length of the alignment.
164 @entries.first.length
167 # Method that returns the number of members or sequences in the alignment.
172 # Method that returns the identity of an alignment with two members.
175 raise AlignError "Bad number of members for similarity calculation: #{self.members}"
178 na1 = NArray.to_na(@entries[0].seq.upcase, "byte")
179 na2 = NArray.to_na(@entries[1].seq.upcase, "byte")
181 shared = (na1 - na2).count_false
182 total = (@entries[0].length < @entries[1].length) ? @entries[0].length : @entries[1].length
183 identity = shared.to_f / total
188 # Method to create a consensus sequence from an Align object and
189 # return a new Seq object with the consensus.
191 cons = Consensus.new(@entries, @options)
195 # Method to pretty print an alignment from an Align object.
197 cons = Consensus.new(@entries, @options)
200 max_name = @entries.group_by { |entry| entry.seq_name.length }.max.first
204 @entries.each do |entry|
205 output << entry.seq_name + (" " * (max_name + 3 - entry.seq_name.length )) + entry.seq + $/
208 cons_entry = cons.consensus
210 output << " " * (max_name + 3) + cons_entry.seq
211 output << $/ + " " * (max_name + 3) + cons_entry.qual.tr("[@-h]", " ..........ooooooooooOOOOOOOOOO") unless cons_entry.qual.nil?
215 # Method for iterating each of the aligned sequences.
218 @entries.each { |entry| yield entry }
226 # Class for creating a pairwise alignment of two specified sequences. The
227 # alignment is created by casting a search space the size of the sequences
228 # and save the longest perfect match between the sequences and recurse into
229 # the left and right search spaced around this match. When all search spaces
230 # are exhausted the saved matches are used to insert gaps in the sequences.
234 # Method to initialize an AlignPair object given two sequence strings.
235 def initialize(q_seq, s_seq)
241 # Method that locates the longest perfect match in a specified search space
242 # by comparing two sequence indeces. The first index is created by hashing
243 # for the first sequence the position of all non-overlapping unique oligos
244 # with the oligo as key an the position as value. The second index is
245 # created for the second sequence by hashing the position of all
246 # overlapping unique oligos in a similar way. The longest match is located
247 # by comparing these indeces and the longest match is expanded maximally
248 # within the specified search space. Finally, the longest match is used
249 # to define a left and a right search space which are recursed into
250 # until no more matches are found.
251 def align(q_space_beg, q_space_end, s_space_beg, s_space_end, length)
252 # puts "search space: #{q_space_beg}-#{q_space_end} x #{s_space_beg}-#{s_space_end}"
255 while new_matches.empty? and length > 0
256 if @q_seq.length >= length and @s_seq.length >= length
257 q_index = index_seq(q_space_beg, q_space_end, @q_seq, length, length)
258 s_index = index_seq(s_space_beg, s_space_end, @s_seq, length, 1)
259 new_matches = find_matches(q_index, s_index, length)
262 unless new_matches.empty?
263 longest_match = new_matches.sort_by { |match| match.length }.last
264 longest_match.expand(@q_seq, @s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end)
267 @matches << longest_match
269 q_space_beg_left = (q_space_beg == 0) ? 0 : q_space_beg + 1
270 s_space_beg_left = (s_space_beg == 0) ? 0 : s_space_beg + 1
271 q_space_end_left = longest_match.q_beg - 1
272 s_space_end_left = longest_match.s_beg - 1
274 q_space_beg_right = longest_match.q_end + 1
275 s_space_beg_right = longest_match.s_end + 1
276 q_space_end_right = (q_space_end == @q_seq.length) ? @q_seq.length : q_space_end - 1
277 s_space_end_right = (s_space_end == @s_seq.length) ? @s_seq.length : s_space_end - 1
279 if q_space_end_left - q_space_beg_left > 0 and s_space_end_left - s_space_beg_left > 0
280 align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, length)
283 if q_space_end_right - q_space_beg_right > 0 and s_space_end_right - s_space_beg_right > 0
284 align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, length)
292 # Method for debugging purposes that upcase matching sequence while non-matches
293 # sequence is kept in lower case.
295 @matches.each do |match|
296 @q_seq[match.q_beg .. match.q_end] = @q_seq[match.q_beg .. match.q_end].upcase
297 @s_seq[match.s_beg .. match.s_end] = @s_seq[match.s_beg .. match.s_end].upcase
301 # Method that insert gaps in sequences based on a list of matches and thus
302 # creating an alignment.
303 # TODO check boundaries!
305 @matches.sort_by! { |m| m.q_beg }
310 @matches.each do |match|
311 diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg)
313 #pp "q_gaps #{q_gaps} s_gaps #{s_gaps} diff #{diff}"
316 @q_seq.insert(match.q_beg + q_gaps, "-" * diff.abs)
319 @s_seq.insert(match.s_beg + s_gaps, "-" * diff.abs)
324 diff = @q_seq.length - @s_seq.length
327 @q_seq << ("-" * diff.abs)
329 @s_seq << ("-" * diff.abs)
333 # Method that dumps all matches as Biopiece records for use with
336 @matches.sort_by! { |m| m.q_beg }
339 puts "Q_BEG: #{m.q_beg}"
340 puts "Q_END: #{m.q_end}"
341 puts "S_BEG: #{m.s_beg}"
342 puts "S_END: #{m.s_end}"
350 # Method for indexing a sequence given a search space, oligo size
351 # and step size. All oligo positions are saved in a lookup hash
352 # where the oligo is the key and the position is the value. Non-
353 # unique oligos are removed and the index returned.
354 def index_seq(space_beg, space_end, seq, oligo_size, step_size)
356 index_count = Hash.new(0)
358 (space_beg ... space_end).step(step_size).each do |pos|
359 oligo = seq[pos ... pos + oligo_size]
361 if oligo.length == oligo_size
363 index_count[oligo] += 1
367 index_count.each do |oligo, count|
368 index.delete(oligo) if count > 1
374 # Method for locating matches between two indexes where one is created
375 # using non-overlapping unique oligos and their positions and one is
376 # using overlapping unique oligos and positions. Thus iterating over the
377 # non-overlapping oligos and checking last match, this can be extended.
378 def find_matches(q_index, s_index, length)
381 q_index.each do |oligo, q_pos|
383 s_pos = s_index[oligo]
385 last_match = matches.last
386 new_match = Match.new(q_pos, s_pos, length)
389 last_match.q_beg + last_match.length == new_match.q_beg and
390 last_match.s_beg + last_match.length == new_match.s_beg
392 last_match.length += length
402 # Class for Match objects.
404 attr_accessor :length
405 attr_reader :q_beg, :s_beg
407 def initialize(q_beg, s_beg, length)
422 "q_beg=#{q_beg} q_end=#{q_end} s_beg=#{s_beg} s_end=#{s_end} length=#{length}"
425 # Method to expand a match as far as possible to the left and right
426 # within a given search space.
427 def expand(q_seq, s_seq, q_space_beg, q_space_end, s_space_beg, s_space_end)
428 expand_left(q_seq, s_seq, q_space_beg, s_space_beg)
429 expand_right(q_seq, s_seq, q_space_end, s_space_end)
434 # Method to expand a match as far as possible to the left within a given
436 def expand_left(q_seq, s_seq, q_space_beg, s_space_beg)
437 while @q_beg > q_space_beg and @s_beg > s_space_beg and q_seq[@q_beg - 1] == s_seq[@s_beg - 1]
444 # Method to expand a match as far as possible to the right within a given
446 def expand_right(q_seq, s_seq, q_space_end, s_space_end)
447 while q_end + 1 <= q_space_end and s_end + 1 <= s_space_end and q_seq[q_end + 1] == s_seq[s_end + 1]
455 # Method to initialize a Consensus object given a list of aligned Seq object.
456 def initialize(entries, options)
460 @cols = entries.first.seq.length
463 @has_qual = entries.first.qual.nil? ? false : true
465 @na_seq = NArray.byte(@cols, @rows)
466 @na_qual = NArray.byte(@cols, @rows) if @has_qual
472 # Method that lowercase residues that have been removed during
473 # the determination of the consensus sequence.
475 na_seq = NArray.byte(@cols, @rows)
477 @entries.each_with_index do |entry, i|
478 na_seq[true, i] = NArray.to_na(entry.seq.upcase, "byte")
481 na_seq += ((na_seq.ne('-'.ord) - @na_seq.ne(0)) * ' '.ord)
483 @entries.each_with_index do |entry, i|
484 entry.seq = na_seq[true, i].to_s
488 # Method that returns a Sequence object with a consensus sequence
489 # for the entries in an Align object.
492 new_seq.seq = consensus_seq
493 new_seq.qual = consensus_qual if @has_qual
501 # Method to add a Seq entry object to the two NArrays; @na_seq and @na_qual
503 @entries.each_with_index do |entry, i|
504 @na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
505 @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - SCORE_BASE if @has_qual
509 # Method to calculate a consensus sequence from a list of sequenced stored in two
513 if @options.has_key? :quality_min
514 mask = mask_quality_min
520 if @options.has_key? :quality_mean
521 mask = mask_quality_mean
528 if @options.has_key? :sequence_min
529 mask = mask_sequence_min
532 @na_qual *= mask if @has_qual
535 if @options.has_key? :gap_max
539 @na_qual *= mask if @has_qual
542 if @options.has_key? :residue_min
543 mask = mask_residue_min
546 @na_qual *= mask if @has_qual
550 # Mask that indicates which columns have more than sequence_min sequences.
551 # Columns with less than sequence_min are 0'ed, otherwise set to 1.
552 def mask_sequence_min
553 mask = NArray.byte(@cols, @rows) + 1
554 mask *= ((@na_seq > 0).to_type("int").sum(1) >= @options[:sequence_min])
558 # Mask that indicates which residue frequencies that are above the residue_min.
559 # The residue frequencies are calculated for each column and residue type as the
560 # number of each residue type over the sum of all non-gap residues in that column.
561 # Positions with residues above the residue_min are indicated with 1.
563 cons_min = @options[:residue_min]
564 factor = 1 / @na_seq.ne(0).to_type("float").sum(1)
566 mask_A = (@na_seq & BIT_A > 0).to_type("int")
567 mask_T = (@na_seq & BIT_T > 0).to_type("int")
568 mask_C = (@na_seq & BIT_C > 0).to_type("int")
569 mask_G = (@na_seq & BIT_G > 0).to_type("int")
571 mask_A = (mask_A * mask_A.sum(1)) * factor >= cons_min
572 mask_T = (mask_T * mask_T.sum(1)) * factor >= cons_min
573 mask_C = (mask_C * mask_C.sum(1)) * factor >= cons_min
574 mask_G = (mask_G * mask_G.sum(1)) * factor >= cons_min
576 mask_A | mask_T | mask_C | mask_G
579 # Mask that indicates which columns contain fewer gaps than max_gap.
580 # Columns with more gaps are 0'ed, others are set to 1.
582 mask = NArray.byte(@cols, @rows) + 1
583 mask *= @na_seq.ne(0).to_type("float").sum(1) / @rows > @options[:gap_max]
588 # Mask that indicates which residues in an alignment are above quality_min.
589 # Positions with subquality are 0'ed - all others are set to 1.
591 @na_qual > @options[:quality_min]
594 # Mask that indicates which columns have a quality mean above quality_mean which
595 # is the mean of all non-gap quality residues in that column. Columns with less then
596 # quality_mean are 0'ed, otherwise set to 1.
597 def mask_quality_mean
598 mask = NArray.byte(@cols, @rows) + 1
599 residues = @na_seq.ne(0).to_type("int").sum(1)
600 quality = @na_qual.to_type("float").sum(1)
602 mask * (quality / residues).round > @options[:quality_mean]
605 # Method to calculate a consensus sequence from a Consensus object.
607 cons = NArray.byte(@cols)
608 cons |= (@na_seq & BIT_A).max(1)
609 cons |= (@na_seq & BIT_T).max(1)
610 cons |= (@na_seq & BIT_C).max(1)
611 cons |= (@na_seq & BIT_G).max(1)
613 cons.to_s.tr!(TR_HEX, TR_NUC).upcase
616 # Method to calculate a consensus quality from a Consensus object.
618 (@na_qual.mean(1).round.to_type("byte") + SCORE_BASE).to_s