--- /dev/null
+#!/usr/bin/env ruby
+
+# Copyright (C) 2007-2012 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This program is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Replace values to a specied key for each record in the stream.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'maasha/biopieces'
+require 'maasha/seq'
+require 'inline'
+
+# Class containing methods to locate over-represented kmers in sequences.
+class KmerStats
+ NUC_ALPH_SIZE = 4
+ BYTES_IN_INT = 4
+
+ # Method to initialize a KmerStats object.
+ def initialize(kmer_size)
+ @kmer_size = kmer_size
+
+ raise ArgumentError, "kmer_size: #{kmer_size} > 16" if kmer_size > 16
+
+ @kmer_tot = 0
+ @freq_tot = 0
+ @kmer_index_size = NUC_ALPH_SIZE ** kmer_size
+ @freq_index_size = NUC_ALPH_SIZE
+ @kmer_index = "\0" * @kmer_index_size * BYTES_IN_INT
+ @freq_index = "\0" * @freq_index_size * BYTES_IN_INT
+ end
+
+ # Method to add a sequence to the KmerStats.
+ def add(seq)
+ @kmer_tot += add_C(seq, seq.length, @kmer_size, @kmer_index, @freq_index)
+ @freq_tot += seq.length
+ end
+
+ # Method for iterating the KmerStats and return.
+ # each kmer as a binary value.
+ def each
+ (0 ... @kmer_index_size).each do |bin|
+ yield bin
+ end
+ end
+
+ # Method to determine the probability of an observed binary kmer.
+ def p_kmer(bin)
+ p_kmer_C(bin, @kmer_tot, @kmer_index)
+ end
+
+ # Method to determine the probablity of a theoretical binary kmer.
+ def p_freq(bin)
+ p_freq_C(bin, @freq_tot, @kmer_size, @freq_index)
+ end
+
+ # Method that converts a binary kmer to DNA.
+ def kmer(bin)
+ dna = ""
+ mask = 3
+
+ (0 ... @kmer_size).each do
+ case bin & mask
+ when 1 then dna << "T"
+ when 2 then dna << "C"
+ when 3 then dna << "G"
+ else
+ dna << "A"
+ end
+
+ bin >>= 2
+ end
+
+ dna.reverse
+ end
+
+ # Method that returns the count of a given binary kmer.
+ def count(bin)
+ count_C(bin, @kmer_index)
+ end
+
+ private
+
+ inline do |builder|
+ builder.prefix %{
+ unsigned char nuc2bin[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+ }
+
+ builder.c %{
+ VALUE add_C(
+ VALUE _seq,
+ VALUE _seq_len,
+ VALUE _kmer_size,
+ VALUE _kmer_index,
+ VALUE _freq_index
+ )
+ {
+ unsigned char *seq = (unsigned char *) StringValuePtr(_seq);
+ unsigned int seq_len = NUM2UINT(_seq_len);
+ unsigned int kmer_size = NUM2UINT(_kmer_size);
+ unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
+ unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index);
+
+ unsigned int mask = (1 << (2 * kmer_size)) - 1;
+ unsigned int bin = 0;
+ unsigned int val = 0;
+ unsigned int i = 0;
+
+ for (i = 0; i < kmer_size; i++)
+ {
+ bin <<= 2;
+ val = nuc2bin[seq[i]];
+ bin |= val;
+ freq_index[val]++;
+ }
+
+ kmer_index[bin]++;
+
+ while (i < seq_len)
+ {
+ bin <<= 2;
+ val = nuc2bin[seq[i]];
+ bin |= val;
+
+ kmer_index[bin & mask]++;
+ freq_index[val]++;
+
+ i++;
+ }
+
+ return UINT2NUM(i - kmer_size + 1);
+ }
+ }
+
+ builder.c %{
+ VALUE freq_count_C(
+ VALUE _seq,
+ VALUE _seq_len,
+ VALUE _index
+ )
+ {
+ unsigned char *seq = (unsigned char *) StringValuePtr(_seq);
+ unsigned int seq_len = NUM2UINT(_seq_len);
+ unsigned int *index = (unsigned int *) StringValuePtr(_index);
+
+ unsigned int i = 0;
+
+ for (i = 0; i < seq_len; i++)
+ {
+ index[seq[i]]++;
+ }
+
+ return UINT2NUM(i);
+ }
+ }
+
+ builder.c %{
+ VALUE p_kmer_C(
+ VALUE _bin,
+ VALUE _kmer_tot,
+ VALUE _kmer_index
+ )
+ {
+ unsigned int bin = NUM2UINT(_bin);
+ unsigned int kmer_tot = NUM2UINT(_kmer_tot);
+ unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
+
+ double p_kmer = 0.0;
+
+ p_kmer = (double) kmer_index[bin] / kmer_tot;
+
+ return DBL2NUM(p_kmer);
+ }
+ }
+
+ builder.c %{
+ VALUE p_freq_C(
+ VALUE _bin,
+ VALUE _freq_tot,
+ VALUE _kmer_size,
+ VALUE _freq_index
+ )
+ {
+ unsigned int bin = NUM2UINT(_bin);
+ unsigned int freq_tot = NUM2UINT(_freq_tot);
+ unsigned int kmer_size = NUM2UINT(_kmer_size);
+ unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index);
+
+ unsigned int mask = 3;
+ unsigned int i = 0;
+ unsigned int c = 0;
+ double p_freq = 1.0;
+
+ for (i = 0; i < kmer_size; i++)
+ {
+ p_freq *= (double) freq_index[bin & mask] / freq_tot;
+
+ bin >>= 2;
+ }
+
+ return DBL2NUM(p_freq);
+ }
+ }
+
+ builder.c %{
+ VALUE count_C(
+ VALUE _bin,
+ VALUE _kmer_index
+ )
+ {
+ unsigned int bin = NUM2UINT(_bin);
+ unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
+
+ unsigned int count = kmer_index[bin];
+
+ return UINT2NUM(count);
+ }
+ }
+ end
+end
+
+casts = []
+casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>true, :default=>10, :allowed=>nil, :disallowed=>"0"}
+casts << {:long=>'min_ratio', :short=>'m', :type=>'float', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+kmer_stats = KmerStats.new(options[:size])
+
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+ input.each_record do |record|
+ output.puts record
+
+ if record[:SEQ]
+ kmer_stats.add(record[:SEQ])
+ end
+ end
+
+ kmer_stats.each do |bin|
+ p_kmer = kmer_stats.p_kmer(bin)
+
+ next if p_kmer == 0.0
+
+ p_freq = kmer_stats.p_freq(bin)
+
+ ratio = (p_kmer / p_freq).round(2)
+
+ if ratio >= options[:min_ratio]
+ kmer = kmer_stats.kmer(bin)
+ count = kmer_stats.count(bin)
+
+ record = {
+ :REC_TYPE => "KMER_STATS",
+ :KMER => kmer,
+ :COUNT => count,
+ :RATIO => ratio
+ }
+
+ output.puts record
+ end
+ end
+end
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__