From: martinahansen Date: Tue, 25 Sep 2012 07:58:07 +0000 (+0000) Subject: added kmer_stats biopiece X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=006ca05165de49293a39f735f55cf64b8d36c2b3;p=biopieces.git added kmer_stats biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@1938 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/kmer_stats b/bp_bin/kmer_stats new file mode 100755 index 0000000..a14e38b --- /dev/null +++ b/bp_bin/kmer_stats @@ -0,0 +1,305 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2012 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Replace values to a specied key for each record in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'maasha/biopieces' +require 'maasha/seq' +require 'inline' + +# Class containing methods to locate over-represented kmers in sequences. +class KmerStats + NUC_ALPH_SIZE = 4 + BYTES_IN_INT = 4 + + # Method to initialize a KmerStats object. + def initialize(kmer_size) + @kmer_size = kmer_size + + raise ArgumentError, "kmer_size: #{kmer_size} > 16" if kmer_size > 16 + + @kmer_tot = 0 + @freq_tot = 0 + @kmer_index_size = NUC_ALPH_SIZE ** kmer_size + @freq_index_size = NUC_ALPH_SIZE + @kmer_index = "\0" * @kmer_index_size * BYTES_IN_INT + @freq_index = "\0" * @freq_index_size * BYTES_IN_INT + end + + # Method to add a sequence to the KmerStats. + def add(seq) + @kmer_tot += add_C(seq, seq.length, @kmer_size, @kmer_index, @freq_index) + @freq_tot += seq.length + end + + # Method for iterating the KmerStats and return. + # each kmer as a binary value. + def each + (0 ... @kmer_index_size).each do |bin| + yield bin + end + end + + # Method to determine the probability of an observed binary kmer. + def p_kmer(bin) + p_kmer_C(bin, @kmer_tot, @kmer_index) + end + + # Method to determine the probablity of a theoretical binary kmer. + def p_freq(bin) + p_freq_C(bin, @freq_tot, @kmer_size, @freq_index) + end + + # Method that converts a binary kmer to DNA. + def kmer(bin) + dna = "" + mask = 3 + + (0 ... @kmer_size).each do + case bin & mask + when 1 then dna << "T" + when 2 then dna << "C" + when 3 then dna << "G" + else + dna << "A" + end + + bin >>= 2 + end + + dna.reverse + end + + # Method that returns the count of a given binary kmer. + def count(bin) + count_C(bin, @kmer_index) + end + + private + + inline do |builder| + builder.prefix %{ + unsigned char nuc2bin[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + } + + builder.c %{ + VALUE add_C( + VALUE _seq, + VALUE _seq_len, + VALUE _kmer_size, + VALUE _kmer_index, + VALUE _freq_index + ) + { + unsigned char *seq = (unsigned char *) StringValuePtr(_seq); + unsigned int seq_len = NUM2UINT(_seq_len); + unsigned int kmer_size = NUM2UINT(_kmer_size); + unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); + unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index); + + unsigned int mask = (1 << (2 * kmer_size)) - 1; + unsigned int bin = 0; + unsigned int val = 0; + unsigned int i = 0; + + for (i = 0; i < kmer_size; i++) + { + bin <<= 2; + val = nuc2bin[seq[i]]; + bin |= val; + freq_index[val]++; + } + + kmer_index[bin]++; + + while (i < seq_len) + { + bin <<= 2; + val = nuc2bin[seq[i]]; + bin |= val; + + kmer_index[bin & mask]++; + freq_index[val]++; + + i++; + } + + return UINT2NUM(i - kmer_size + 1); + } + } + + builder.c %{ + VALUE freq_count_C( + VALUE _seq, + VALUE _seq_len, + VALUE _index + ) + { + unsigned char *seq = (unsigned char *) StringValuePtr(_seq); + unsigned int seq_len = NUM2UINT(_seq_len); + unsigned int *index = (unsigned int *) StringValuePtr(_index); + + unsigned int i = 0; + + for (i = 0; i < seq_len; i++) + { + index[seq[i]]++; + } + + return UINT2NUM(i); + } + } + + builder.c %{ + VALUE p_kmer_C( + VALUE _bin, + VALUE _kmer_tot, + VALUE _kmer_index + ) + { + unsigned int bin = NUM2UINT(_bin); + unsigned int kmer_tot = NUM2UINT(_kmer_tot); + unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); + + double p_kmer = 0.0; + + p_kmer = (double) kmer_index[bin] / kmer_tot; + + return DBL2NUM(p_kmer); + } + } + + builder.c %{ + VALUE p_freq_C( + VALUE _bin, + VALUE _freq_tot, + VALUE _kmer_size, + VALUE _freq_index + ) + { + unsigned int bin = NUM2UINT(_bin); + unsigned int freq_tot = NUM2UINT(_freq_tot); + unsigned int kmer_size = NUM2UINT(_kmer_size); + unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index); + + unsigned int mask = 3; + unsigned int i = 0; + unsigned int c = 0; + double p_freq = 1.0; + + for (i = 0; i < kmer_size; i++) + { + p_freq *= (double) freq_index[bin & mask] / freq_tot; + + bin >>= 2; + } + + return DBL2NUM(p_freq); + } + } + + builder.c %{ + VALUE count_C( + VALUE _bin, + VALUE _kmer_index + ) + { + unsigned int bin = NUM2UINT(_bin); + unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); + + unsigned int count = kmer_index[bin]; + + return UINT2NUM(count); + } + } + end +end + +casts = [] +casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>true, :default=>10, :allowed=>nil, :disallowed=>"0"} +casts << {:long=>'min_ratio', :short=>'m', :type=>'float', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +kmer_stats = KmerStats.new(options[:size]) + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + output.puts record + + if record[:SEQ] + kmer_stats.add(record[:SEQ]) + end + end + + kmer_stats.each do |bin| + p_kmer = kmer_stats.p_kmer(bin) + + next if p_kmer == 0.0 + + p_freq = kmer_stats.p_freq(bin) + + ratio = (p_kmer / p_freq).round(2) + + if ratio >= options[:min_ratio] + kmer = kmer_stats.kmer(bin) + count = kmer_stats.count(bin) + + record = { + :REC_TYPE => "KMER_STATS", + :KMER => kmer, + :COUNT => count, + :RATIO => ratio + } + + output.puts record + end + end +end + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__