+++ /dev/null
-#!/usr/bin/env ruby
-
-# Copyright (C) 2007-2012 Martin A. Hansen.
-
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-
-# http://www.gnu.org/copyleft/gpl.html
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-# This program is part of the Biopieces framework (www.biopieces.org).
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-# Replace values to a specied key for each record in the stream.
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-require 'maasha/biopieces'
-require 'maasha/seq'
-require 'inline'
-
-# Class containing methods to locate over-represented kmers in sequences.
-class KmerStats
- NUC_ALPH_SIZE = 4
- BYTES_IN_INT = 4
-
- # Method to initialize a KmerStats object.
- def initialize(kmer_size)
- @kmer_size = kmer_size
-
- raise ArgumentError, "kmer_size: #{kmer_size} > 16" if kmer_size > 16
-
- @kmer_tot = 0
- @freq_tot = 0
- @kmer_index_size = NUC_ALPH_SIZE ** kmer_size
- @freq_index_size = NUC_ALPH_SIZE
- @kmer_index = "\0" * @kmer_index_size * BYTES_IN_INT
- @freq_index = "\0" * @freq_index_size * BYTES_IN_INT
- end
-
- # Method to add a sequence to the KmerStats.
- def add(seq)
- @kmer_tot += add_C(seq, seq.length, @kmer_size, @kmer_index, @freq_index)
- @freq_tot += seq.length
- end
-
- # Method for iterating the KmerStats and return.
- # each kmer as a binary value.
- def each
- (0 ... @kmer_index_size).each do |bin|
- yield bin
- end
- end
-
- # Method to determine the probability of an observed binary kmer.
- def p_kmer(bin)
- p_kmer_C(bin, @kmer_tot, @kmer_index)
- end
-
- # Method to determine the probablity of a theoretical binary kmer.
- def p_freq(bin)
- p_freq_C(bin, @freq_tot, @kmer_size, @freq_index)
- end
-
- # Method that converts a binary kmer to DNA.
- def kmer(bin)
- dna = ""
- mask = 3
-
- (0 ... @kmer_size).each do
- case bin & mask
- when 1 then dna << "T"
- when 2 then dna << "C"
- when 3 then dna << "G"
- else
- dna << "A"
- end
-
- bin >>= 2
- end
-
- dna.reverse
- end
-
- # Method that returns the count of a given binary kmer.
- def count(bin)
- count_C(bin, @kmer_index)
- end
-
- private
-
- inline do |builder|
- builder.prefix %{
- unsigned char nuc2bin[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- };
- }
-
- builder.c %{
- VALUE add_C(
- VALUE _seq,
- VALUE _seq_len,
- VALUE _kmer_size,
- VALUE _kmer_index,
- VALUE _freq_index
- )
- {
- unsigned char *seq = (unsigned char *) StringValuePtr(_seq);
- unsigned int seq_len = NUM2UINT(_seq_len);
- unsigned int kmer_size = NUM2UINT(_kmer_size);
- unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
- unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index);
-
- unsigned int mask = (1 << (2 * kmer_size)) - 1;
- unsigned int bin = 0;
- unsigned int val = 0;
- unsigned int i = 0;
-
- for (i = 0; i < kmer_size; i++)
- {
- bin <<= 2;
- val = nuc2bin[seq[i]];
- bin |= val;
- freq_index[val]++;
- }
-
- kmer_index[bin]++;
-
- while (i < seq_len)
- {
- bin <<= 2;
- val = nuc2bin[seq[i]];
- bin |= val;
-
- kmer_index[bin & mask]++;
- freq_index[val]++;
-
- i++;
- }
-
- return UINT2NUM(i - kmer_size + 1);
- }
- }
-
- builder.c %{
- VALUE freq_count_C(
- VALUE _seq,
- VALUE _seq_len,
- VALUE _index
- )
- {
- unsigned char *seq = (unsigned char *) StringValuePtr(_seq);
- unsigned int seq_len = NUM2UINT(_seq_len);
- unsigned int *index = (unsigned int *) StringValuePtr(_index);
-
- unsigned int i = 0;
-
- for (i = 0; i < seq_len; i++)
- {
- index[seq[i]]++;
- }
-
- return UINT2NUM(i);
- }
- }
-
- builder.c %{
- VALUE p_kmer_C(
- VALUE _bin,
- VALUE _kmer_tot,
- VALUE _kmer_index
- )
- {
- unsigned int bin = NUM2UINT(_bin);
- unsigned int kmer_tot = NUM2UINT(_kmer_tot);
- unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
-
- double p_kmer = 0.0;
-
- p_kmer = (double) kmer_index[bin] / kmer_tot;
-
- return DBL2NUM(p_kmer);
- }
- }
-
- builder.c %{
- VALUE p_freq_C(
- VALUE _bin,
- VALUE _freq_tot,
- VALUE _kmer_size,
- VALUE _freq_index
- )
- {
- unsigned int bin = NUM2UINT(_bin);
- unsigned int freq_tot = NUM2UINT(_freq_tot);
- unsigned int kmer_size = NUM2UINT(_kmer_size);
- unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index);
-
- unsigned int mask = 3;
- unsigned int i = 0;
- unsigned int c = 0;
- double p_freq = 1.0;
-
- for (i = 0; i < kmer_size; i++)
- {
- p_freq *= (double) freq_index[bin & mask] / freq_tot;
-
- bin >>= 2;
- }
-
- return DBL2NUM(p_freq);
- }
- }
-
- builder.c %{
- VALUE count_C(
- VALUE _bin,
- VALUE _kmer_index
- )
- {
- unsigned int bin = NUM2UINT(_bin);
- unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
-
- unsigned int count = kmer_index[bin];
-
- return UINT2NUM(count);
- }
- }
- end
-end
-
-casts = []
-casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>true, :default=>10, :allowed=>nil, :disallowed=>"0"}
-casts << {:long=>'min_ratio', :short=>'m', :type=>'float', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
-
-options = Biopieces.options_parse(ARGV, casts)
-
-kmer_stats = KmerStats.new(options[:size])
-
-Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
- input.each_record do |record|
- output.puts record
-
- if record[:SEQ]
- kmer_stats.add(record[:SEQ])
- end
- end
-
- kmer_stats.each do |bin|
- p_kmer = kmer_stats.p_kmer(bin)
-
- next if p_kmer == 0.0
-
- p_freq = kmer_stats.p_freq(bin)
-
- ratio = (p_kmer / p_freq).round(2)
-
- if ratio >= options[:min_ratio]
- kmer = kmer_stats.kmer(bin)
- count = kmer_stats.count(bin)
-
- record = {
- :REC_TYPE => "KMER_STATS",
- :P_KMER => p_kmer,
- :KMER => kmer,
- :COUNT => count,
- :RATIO => ratio
- }
-
- output.puts record
- end
- end
-end
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-__END__