--- /dev/null
+# Copyright (C) 2007-2011 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This software is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'pp'
+require 'open3'
+require 'narray'
+require 'maasha/fasta'
+
+class AlignError < StandardError; end;
+
+CONSENSUS = 20
+INDEL_RATIO = 0.5
+QUAL_MIN = 15
+QUAL_BASE = 64
+ALPH_DNA = %w{A T C G}
+ALPH_AMBI = %w{A T C G M R W S Y K V H D B N}
+
+BIT_INDEL = 0
+BIT_A = 1 << 0
+BIT_T = 1 << 1
+BIT_C = 1 << 2
+BIT_G = 1 << 3
+BIT_M = BIT_A | BIT_C
+BIT_R = BIT_A | BIT_G
+BIT_W = BIT_A | BIT_T
+BIT_S = BIT_C | BIT_G
+BIT_Y = BIT_C | BIT_T
+BIT_K = BIT_G | BIT_T
+BIT_V = BIT_A | BIT_C | BIT_G
+BIT_H = BIT_A | BIT_C | BIT_T
+BIT_D = BIT_A | BIT_G | BIT_T
+BIT_B = BIT_C | BIT_G | BIT_T
+BIT_N = BIT_G | BIT_A | BIT_T | BIT_C
+
+BITMAP = [
+ BIT_A,
+ BIT_T,
+ BIT_C,
+ BIT_G,
+ BIT_M,
+ BIT_R,
+ BIT_W,
+ BIT_S,
+ BIT_Y,
+ BIT_K,
+ BIT_V,
+ BIT_H,
+ BIT_D,
+ BIT_B,
+ BIT_N
+]
+
+TR_NUC = "-" + ALPH_AMBI.join("").downcase
+TR_HEX = [BIT_INDEL].pack("C") + BITMAP.pack("C*")
+
+ROW_A = 0
+ROW_T = 1
+ROW_C = 2
+ROW_G = 3
+
+# Adding an initialize method to the existing Fasta class.
+class Fasta
+ def initialize(io)
+ @io = io
+ end
+end
+
+# Class for aligning sequences.
+class Align
+ # Class method to align sequences in a list of Seq objects and
+ # return these as a new list of Seq objects.
+ def self.muscle(entries)
+ cmd = "muscle"
+ aligned = []
+
+ Open3.popen3(cmd) do |stdin, stdout, stderr|
+ entries.each { |entry| stdin.puts entry.to_fasta }
+ stdin.close
+
+ fa = Fasta.new(stdout)
+
+ fa.each do |align|
+ if block_given?
+ yield align
+ else
+ aligned << align
+ end
+ end
+ end
+
+ return aligned unless block_given?
+ end
+
+ # Class method to create a consensus sequence from a list of Seq objects and
+ # return a new Seq object with the consensus.
+ def self.consensus(entries)
+ cons = Consensus.new(entries.first.length)
+
+ entries.each { |entry| cons.add(entry) }
+
+ cons.to_seq
+ end
+
+ class Consensus
+ # Method to initialize a Consensus object given a Seq object, which is added
+ # to the Consensus.
+ def initialize(length)
+ @length = length
+
+ @count = NArray.int(@length)
+ @freq = NArray.int(@length, ALPH_DNA.size)
+ @qual = NArray.int(@length, ALPH_DNA.size)
+ @mask_A = NArray.byte(@length).fill!(BIT_A)
+ @mask_T = NArray.byte(@length).fill!(BIT_T)
+ @mask_C = NArray.byte(@length).fill!(BIT_C)
+ @mask_G = NArray.byte(@length).fill!(BIT_G)
+ @has_qual = false
+ end
+
+ # Method to add a Seq entry to a Consensus object.
+ def add(entry)
+ seq = NArray.to_na(entry.seq.downcase.tr(TR_NUC, TR_HEX), "byte")
+
+ @count += seq > 0
+
+ mask_A = (seq & @mask_A) > 0
+ mask_T = (seq & @mask_T) > 0
+ mask_C = (seq & @mask_C) > 0
+ mask_G = (seq & @mask_G) > 0
+
+ @freq[true, ROW_A] += mask_A
+ @freq[true, ROW_T] += mask_T
+ @freq[true, ROW_C] += mask_C
+ @freq[true, ROW_G] += mask_G
+
+ unless entry.qual.nil?
+ @has_qual = true
+
+ qual = NArray.to_na(entry.qual, "byte") - QUAL_BASE
+
+ @qual[true, ROW_A] += mask_A * qual
+ @qual[true, ROW_T] += mask_T * qual
+ @qual[true, ROW_C] += mask_C * qual
+ @qual[true, ROW_G] += mask_G * qual
+ end
+ end
+
+ def to_seq
+ #qual_filter
+ new_seq = Seq.new
+ new_seq.seq = consensus_seq
+ new_seq.qual = consensus_qual if @has_qual
+ new_seq.type = "dna"
+
+ new_seq
+ end
+
+ private
+
+ # Method that calculates the sum for each column except the indel row and
+ # returns the sum in an NArray.
+ def freq_total
+ @freq.sum(1)
+ end
+
+ # Method that locates the maximum value for each column and
+ # returns this in an NArray.
+ def freq_max
+ @freq.max(1)
+ end
+
+ def qual_mean
+ freq = @freq[true, [0 ... ALPH_DNA.size]]
+ @qual / (freq + freq.eq(0))
+ end
+
+ def freq_percent
+ (@freq / freq_total.to_type("float") * 100).round
+ end
+
+ # Method to calculate a consensus sequence from a Consensus object.
+ def consensus_seq
+ fp = freq_percent > CONSENSUS
+ cons = NArray.byte(@length)
+ cons |= fp[true, ROW_A] *= BIT_A
+ cons |= fp[true, ROW_T] *= BIT_T
+ cons |= fp[true, ROW_C] *= BIT_C
+ cons |= fp[true, ROW_G] *= BIT_G
+
+ cons *= @count > @count.max * INDEL_RATIO
+
+ cons.to_s.tr!(TR_HEX, TR_NUC).upcase
+ end
+
+ def consensus_qual
+ q = (@qual / (@count + @count.eq(0))).max(1)
+
+ (q.to_type("byte") + QUAL_BASE).to_s
+ end
+
+ def qual_filter
+ filter = qual_mean > QUAL_MIN
+
+ # pp @freq
+ # pp @qual
+
+ # @freq *= filter
+ # @qual *= filter
+
+ # pp @freq
+ # pp @qual
+ end
+ end
+end
+
+
+
+__END__