#!/usr/bin/env ruby # Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # http://www.gnu.org/copyleft/gpl.html # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # This program is part of the Biopieces framework (www.biopieces.org). # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Replace values to a specied key for each record in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/biopieces' require 'maasha/seq' require 'inline' # Class containing methods to locate over-represented kmers in sequences. class KmerStats NUC_ALPH_SIZE = 4 BYTES_IN_INT = 4 # Method to initialize a KmerStats object. def initialize(kmer_size) @kmer_size = kmer_size raise ArgumentError, "kmer_size: #{kmer_size} > 16" if kmer_size > 16 @kmer_tot = 0 @freq_tot = 0 @kmer_index_size = NUC_ALPH_SIZE ** kmer_size @freq_index_size = NUC_ALPH_SIZE @kmer_index = "\0" * @kmer_index_size * BYTES_IN_INT @freq_index = "\0" * @freq_index_size * BYTES_IN_INT end # Method to add a sequence to the KmerStats. def add(seq) @kmer_tot += add_C(seq, seq.length, @kmer_size, @kmer_index, @freq_index) @freq_tot += seq.length end # Method for iterating the KmerStats and return. # each kmer as a binary value. def each (0 ... @kmer_index_size).each do |bin| yield bin end end # Method to determine the probability of an observed binary kmer. def p_kmer(bin) p_kmer_C(bin, @kmer_tot, @kmer_index) end # Method to determine the probablity of a theoretical binary kmer. def p_freq(bin) p_freq_C(bin, @freq_tot, @kmer_size, @freq_index) end # Method that converts a binary kmer to DNA. def kmer(bin) dna = "" mask = 3 (0 ... @kmer_size).each do case bin & mask when 1 then dna << "T" when 2 then dna << "C" when 3 then dna << "G" else dna << "A" end bin >>= 2 end dna.reverse end # Method that returns the count of a given binary kmer. def count(bin) count_C(bin, @kmer_index) end private inline do |builder| builder.prefix %{ unsigned char nuc2bin[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; } builder.c %{ VALUE add_C( VALUE _seq, VALUE _seq_len, VALUE _kmer_size, VALUE _kmer_index, VALUE _freq_index ) { unsigned char *seq = (unsigned char *) StringValuePtr(_seq); unsigned int seq_len = NUM2UINT(_seq_len); unsigned int kmer_size = NUM2UINT(_kmer_size); unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index); unsigned int mask = (1 << (2 * kmer_size)) - 1; unsigned int bin = 0; unsigned int val = 0; unsigned int i = 0; for (i = 0; i < kmer_size; i++) { bin <<= 2; val = nuc2bin[seq[i]]; bin |= val; freq_index[val]++; } kmer_index[bin]++; while (i < seq_len) { bin <<= 2; val = nuc2bin[seq[i]]; bin |= val; kmer_index[bin & mask]++; freq_index[val]++; i++; } return UINT2NUM(i - kmer_size + 1); } } builder.c %{ VALUE freq_count_C( VALUE _seq, VALUE _seq_len, VALUE _index ) { unsigned char *seq = (unsigned char *) StringValuePtr(_seq); unsigned int seq_len = NUM2UINT(_seq_len); unsigned int *index = (unsigned int *) StringValuePtr(_index); unsigned int i = 0; for (i = 0; i < seq_len; i++) { index[seq[i]]++; } return UINT2NUM(i); } } builder.c %{ VALUE p_kmer_C( VALUE _bin, VALUE _kmer_tot, VALUE _kmer_index ) { unsigned int bin = NUM2UINT(_bin); unsigned int kmer_tot = NUM2UINT(_kmer_tot); unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); double p_kmer = 0.0; p_kmer = (double) kmer_index[bin] / kmer_tot; return DBL2NUM(p_kmer); } } builder.c %{ VALUE p_freq_C( VALUE _bin, VALUE _freq_tot, VALUE _kmer_size, VALUE _freq_index ) { unsigned int bin = NUM2UINT(_bin); unsigned int freq_tot = NUM2UINT(_freq_tot); unsigned int kmer_size = NUM2UINT(_kmer_size); unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index); unsigned int mask = 3; unsigned int i = 0; unsigned int c = 0; double p_freq = 1.0; for (i = 0; i < kmer_size; i++) { p_freq *= (double) freq_index[bin & mask] / freq_tot; bin >>= 2; } return DBL2NUM(p_freq); } } builder.c %{ VALUE count_C( VALUE _bin, VALUE _kmer_index ) { unsigned int bin = NUM2UINT(_bin); unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); unsigned int count = kmer_index[bin]; return UINT2NUM(count); } } end end casts = [] casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>true, :default=>10, :allowed=>nil, :disallowed=>"0"} casts << {:long=>'min_ratio', :short=>'m', :type=>'float', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) kmer_stats = KmerStats.new(options[:size]) Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| output.puts record if record[:SEQ] kmer_stats.add(record[:SEQ]) end end kmer_stats.each do |bin| p_kmer = kmer_stats.p_kmer(bin) next if p_kmer == 0.0 p_freq = kmer_stats.p_freq(bin) ratio = (p_kmer / p_freq).round(2) if ratio >= options[:min_ratio] kmer = kmer_stats.kmer(bin) count = kmer_stats.count(bin) record = { :REC_TYPE => "KMER_STATS", :P_KMER => p_kmer, :KMER => kmer, :COUNT => count, :RATIO => ratio } output.puts record end end end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< __END__