require 'pp'
require 'open3'
require 'narray'
+require 'maasha/align/pair'
require 'maasha/fasta'
-require 'maasha/mum'
class AlignError < StandardError; end;
ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
BIT_INDEL = 0
-# BIT_A = 1 << 0
-# BIT_T = 1 << 1
-# BIT_C = 1 << 2
-# BIT_G = 1 << 3
+BIT_A = 1 << 0
+BIT_T = 1 << 1
+BIT_C = 1 << 2
+BIT_G = 1 << 3
BIT_M = BIT_A | BIT_C
BIT_R = BIT_A | BIT_G
attr_accessor :options
attr_reader :entries
+ include PairAlign
+
# Class method to align sequences in a list of Seq objects and
# return these as a new list of Seq objects.
def self.muscle(entries, verbose = false)
Open3.popen3("muscle", "-quiet") do |stdin, stdout, stderr|
entries.each do |entry|
- raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index.has_key? entry.seq_name
+ raise AlignError, "Duplicate sequence name: #{entry.seq_name}" if index[entry.seq_name]
index[entry.seq_name] = entry.dup
# Class method to create a pairwise alignment of two given Seq objects. The
# alignment is created by casting a search space the size of the sequences
- # and save the longest perfect match between the sequences and recurse into
+ # and save the best scoring match between the sequences and recurse into
# the left and right search spaced around this match. When all search spaces
# are exhausted the saved matches are used to insert gaps in the sequences.
def self.pair(q_entry, s_entry)
- q_space_beg = 0
- q_space_end = q_entry.length
- s_space_beg = 0
- s_space_end = s_entry.length
-
- q_entry.seq.downcase!
- s_entry.seq.downcase!
-
- ap = AlignPair.new(q_entry, s_entry)
-
- ap.align(q_space_beg, q_space_end, s_space_beg, s_space_end, 15, [])
-
- ap.upcase_match
- ap.insert_gaps
-
- #ap.dump_bp and exit
+ AlignPair.align(q_entry, s_entry)
self.new([q_entry, s_entry])
end
private
- # Class for creating a pairwise alignment of two specified sequences. The
- # alignment is created by casting a search space the size of the sequences
- # and save the longest perfect match between the sequences and recurse into
- # the left and right search spaced around this match. When all search spaces
- # are exhausted the saved matches are used to insert gaps in the sequences.
- class AlignPair
- attr_reader :matches
-
- # Method to initialize an AlignPair object given two Seq objects.
- def initialize(q_entry, s_entry)
- @q_entry = q_entry
- @s_entry = s_entry
- @matches = []
- end
-
- # Method that locates the longest perfect match in a specified search space
- # by comparing two sequence indeces. The first index is created by hashing
- # for the first sequence the position of all non-overlapping unique oligos
- # with the oligo as key an the position as value. The second index is
- # created for the second sequence by hashing the position of all
- # overlapping unique oligos in a similar way. The longest match is located
- # by comparing these indeces and the longest match is expanded maximally
- # within the specified search space. Finally, the longest match is used
- # to define a left and a right search space which are recursed into
- # until no more matches are found.
- def align(q_space_beg, q_space_end, s_space_beg, s_space_end, kmer_size, matches)
-# puts "search space: #{q_space_beg}-#{q_space_end} x #{s_space_beg}-#{s_space_end}"
- new_matches = []
-
- matches.each do |match|
- new_matches << match if match.q_beg >= q_space_beg and
- match.q_end <= q_space_end and
- match.s_beg >= s_space_beg and
- match.s_end <= s_space_end
- end
-
- while new_matches.empty? and kmer_size > 0
- if @q_entry.length >= kmer_size and @s_entry.length >= kmer_size
- new_matches = MUMs.find(@q_entry, @s_entry, {"q_space_beg" => q_space_beg,
- "q_space_end" => q_space_end,
- "s_space_beg" => s_space_beg,
- "s_space_end" => s_space_end,
- "kmer_size" => kmer_size} )
- end
-
- unless new_matches.empty?
- longest_match = new_matches.sort_by { |match| match.length }.pop
-# pp longest_match
-
- @matches << longest_match
-
- q_space_beg_left = (q_space_beg == 0) ? 0 : q_space_beg + 1
- s_space_beg_left = (s_space_beg == 0) ? 0 : s_space_beg + 1
- q_space_end_left = longest_match.q_beg - 1
- s_space_end_left = longest_match.s_beg - 1
-
- q_space_beg_right = longest_match.q_end + 1
- s_space_beg_right = longest_match.s_end + 1
- q_space_end_right = (q_space_end == @q_entry.length) ? @q_entry.length : q_space_end - 1
- s_space_end_right = (s_space_end == @s_entry.length) ? @s_entry.length : s_space_end - 1
-
- if q_space_end_left - q_space_beg_left > 0 and s_space_end_left - s_space_beg_left > 0
- align(q_space_beg_left, q_space_end_left, s_space_beg_left, s_space_end_left, kmer_size, new_matches)
- end
-
- if q_space_end_right - q_space_beg_right > 0 and s_space_end_right - s_space_beg_right > 0
- align(q_space_beg_right, q_space_end_right, s_space_beg_right, s_space_end_right, kmer_size, new_matches)
- end
- end
-
- kmer_size /= 2
- end
- end
-
- # Method for debugging purposes that upcase matching sequence while non-matches
- # sequence is kept in lower case.
- def upcase_match
- @matches.each do |match|
- @q_entry.seq[match.q_beg .. match.q_end] = @q_entry.seq[match.q_beg .. match.q_end].upcase
- @s_entry.seq[match.s_beg .. match.s_end] = @s_entry.seq[match.s_beg .. match.s_end].upcase
- end
- end
-
- # Method that insert gaps in sequences based on a list of matches and thus
- # creating an alignment.
- # TODO check boundaries!
- def insert_gaps
- @matches.sort_by! { |m| m.q_beg }
-
- q_gaps = 0
- s_gaps = 0
-
- @matches.each do |match|
- diff = (q_gaps + match.q_beg) - (s_gaps + match.s_beg)
-
- #pp "q_gaps #{q_gaps} s_gaps #{s_gaps} diff #{diff}"
-
- if diff < 0
- @q_entry.seq.insert(match.q_beg + q_gaps, "-" * diff.abs)
- q_gaps += diff.abs
- elsif diff > 0
- @s_entry.seq.insert(match.s_beg + s_gaps, "-" * diff.abs)
- s_gaps += diff.abs
- end
- end
-
- diff = @q_entry.length - @s_entry.length
-
- if diff < 0
- @q_entry.seq << ("-" * diff.abs)
- else
- @s_entry.seq << ("-" * diff.abs)
- end
- end
-
- # Method that dumps all matches as Biopiece records for use with
- # plot_matches.
- def dump_bp
- @matches.sort_by! { |m| m.q_beg }
-
- @matches.each { |m|
- puts "Q_BEG: #{m.q_beg}"
- puts "Q_END: #{m.q_end}"
- puts "S_BEG: #{m.s_beg}"
- puts "S_END: #{m.s_end}"
- puts "STRAND: +"
- puts "---"
- }
- end
- end
-
class Consensus
# Method to initialize a Consensus object given a list of aligned Seq object.
def initialize(entries, options)
new_seq = Seq.new
new_seq.seq = consensus_seq
new_seq.qual = consensus_qual if @has_qual
- new_seq.type = "dna"
+ new_seq.type = :dna
new_seq
end
def na_add_entries
@entries.each_with_index do |entry, i|
@na_seq[true, i] = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
- @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - SCORE_BASE if @has_qual
+ @na_qual[true, i] = NArray.to_na(entry.qual, "byte") - Seq::SCORE_BASE if @has_qual
end
end
# NArrays.
def consensus_calc
if @has_qual
- if @options.has_key? :quality_min
+ if @options[:quality_min]
mask = mask_quality_min
@na_seq *= mask
@na_qual *= mask
end
- if @options.has_key? :quality_mean
+ if @options[:quality_mean]
mask = mask_quality_mean
@na_seq *= mask
end
end
- if @options.has_key? :sequence_min
+ if @options[:sequence_min]
mask = mask_sequence_min
@na_seq *= mask
@na_qual *= mask if @has_qual
end
- if @options.has_key? :gap_max
+ if @options[:gap_max]
mask = mask_gap_max
@na_seq *= mask
@na_qual *= mask if @has_qual
end
- if @options.has_key? :residue_min
+ if @options[:residue_min]
mask = mask_residue_min
@na_seq *= mask
# Method to calculate a consensus quality from a Consensus object.
def consensus_qual
- (@na_qual.mean(1).round.to_type("byte") + SCORE_BASE).to_s
+ (@na_qual.mean(1).round.to_type("byte") + Seq::SCORE_BASE).to_s
end
end
end