From f47f8b7621db2e9ee98c78ca0e4f7313535052c7 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 27 Sep 2012 12:58:17 +0000 Subject: [PATCH] clearning up kmer_stats and kmer_freq git-svn-id: http://biopieces.googlecode.com/svn/trunk@1945 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/kmer_freq | 32 +++-- bp_bin/kmer_stats | 306 ---------------------------------------------- 2 files changed, 19 insertions(+), 319 deletions(-) delete mode 100755 bp_bin/kmer_stats diff --git a/bp_bin/kmer_freq b/bp_bin/kmer_freq index 290f07d..9f45cef 100755 --- a/bp_bin/kmer_freq +++ b/bp_bin/kmer_freq @@ -28,34 +28,40 @@ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - require 'maasha/biopieces' require 'maasha/seq' casts = [] -casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>false, :default=>4, :allowed=>nil, :disallowed=>'0'} -casts << {:long=>'type', :short=>'t', :type=>'string', :mandatory=>false, :default=>"dna", :allowed=>"dna,rna,protein", :disallowed=>nil} +casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>false, :default=>4, :allowed=>nil, :disallowed=>'0'} -options = Biopieces.options_parse(ARGV, casts) +options = Biopieces.options_parse(ARGV, casts) -oligos = Seq.generate_oligos(options[:size], options[:type]) +size = options[:size] +kmer_hash = Hash.new { 0 } Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| - if record.has_key? :SEQ - kmers = {} - oligos.each { |oligo| kmers[oligo.upcase] = 0 } + if record[:SEQ] + seq = record[:SEQ].upcase - (0 ... record[:SEQ].length - options[:size]).each do |i| - kmer = record[:SEQ][i .. i + options[:size] - 1].upcase - kmers[kmer] += 1 if kmers[kmer] + (0 .. seq.length - size).each do |i| + kmer = seq[i ... i + size] + kmer_hash[kmer] += 1 end - - record.merge! kmers end output.puts record end + + kmer_hash.each do |kmer, count| + record = { + :REC_TYPE => "KMER_FREQ", + :KMER => kmer, + :COUNT => count + } + + output.puts record + end end diff --git a/bp_bin/kmer_stats b/bp_bin/kmer_stats deleted file mode 100755 index 154cb8b..0000000 --- a/bp_bin/kmer_stats +++ /dev/null @@ -1,306 +0,0 @@ -#!/usr/bin/env ruby - -# Copyright (C) 2007-2012 Martin A. Hansen. - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -# http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# This program is part of the Biopieces framework (www.biopieces.org). - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Replace values to a specied key for each record in the stream. - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -require 'maasha/biopieces' -require 'maasha/seq' -require 'inline' - -# Class containing methods to locate over-represented kmers in sequences. -class KmerStats - NUC_ALPH_SIZE = 4 - BYTES_IN_INT = 4 - - # Method to initialize a KmerStats object. - def initialize(kmer_size) - @kmer_size = kmer_size - - raise ArgumentError, "kmer_size: #{kmer_size} > 16" if kmer_size > 16 - - @kmer_tot = 0 - @freq_tot = 0 - @kmer_index_size = NUC_ALPH_SIZE ** kmer_size - @freq_index_size = NUC_ALPH_SIZE - @kmer_index = "\0" * @kmer_index_size * BYTES_IN_INT - @freq_index = "\0" * @freq_index_size * BYTES_IN_INT - end - - # Method to add a sequence to the KmerStats. - def add(seq) - @kmer_tot += add_C(seq, seq.length, @kmer_size, @kmer_index, @freq_index) - @freq_tot += seq.length - end - - # Method for iterating the KmerStats and return. - # each kmer as a binary value. - def each - (0 ... @kmer_index_size).each do |bin| - yield bin - end - end - - # Method to determine the probability of an observed binary kmer. - def p_kmer(bin) - p_kmer_C(bin, @kmer_tot, @kmer_index) - end - - # Method to determine the probablity of a theoretical binary kmer. - def p_freq(bin) - p_freq_C(bin, @freq_tot, @kmer_size, @freq_index) - end - - # Method that converts a binary kmer to DNA. - def kmer(bin) - dna = "" - mask = 3 - - (0 ... @kmer_size).each do - case bin & mask - when 1 then dna << "T" - when 2 then dna << "C" - when 3 then dna << "G" - else - dna << "A" - end - - bin >>= 2 - end - - dna.reverse - end - - # Method that returns the count of a given binary kmer. - def count(bin) - count_C(bin, @kmer_index) - end - - private - - inline do |builder| - builder.prefix %{ - unsigned char nuc2bin[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; - } - - builder.c %{ - VALUE add_C( - VALUE _seq, - VALUE _seq_len, - VALUE _kmer_size, - VALUE _kmer_index, - VALUE _freq_index - ) - { - unsigned char *seq = (unsigned char *) StringValuePtr(_seq); - unsigned int seq_len = NUM2UINT(_seq_len); - unsigned int kmer_size = NUM2UINT(_kmer_size); - unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); - unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index); - - unsigned int mask = (1 << (2 * kmer_size)) - 1; - unsigned int bin = 0; - unsigned int val = 0; - unsigned int i = 0; - - for (i = 0; i < kmer_size; i++) - { - bin <<= 2; - val = nuc2bin[seq[i]]; - bin |= val; - freq_index[val]++; - } - - kmer_index[bin]++; - - while (i < seq_len) - { - bin <<= 2; - val = nuc2bin[seq[i]]; - bin |= val; - - kmer_index[bin & mask]++; - freq_index[val]++; - - i++; - } - - return UINT2NUM(i - kmer_size + 1); - } - } - - builder.c %{ - VALUE freq_count_C( - VALUE _seq, - VALUE _seq_len, - VALUE _index - ) - { - unsigned char *seq = (unsigned char *) StringValuePtr(_seq); - unsigned int seq_len = NUM2UINT(_seq_len); - unsigned int *index = (unsigned int *) StringValuePtr(_index); - - unsigned int i = 0; - - for (i = 0; i < seq_len; i++) - { - index[seq[i]]++; - } - - return UINT2NUM(i); - } - } - - builder.c %{ - VALUE p_kmer_C( - VALUE _bin, - VALUE _kmer_tot, - VALUE _kmer_index - ) - { - unsigned int bin = NUM2UINT(_bin); - unsigned int kmer_tot = NUM2UINT(_kmer_tot); - unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); - - double p_kmer = 0.0; - - p_kmer = (double) kmer_index[bin] / kmer_tot; - - return DBL2NUM(p_kmer); - } - } - - builder.c %{ - VALUE p_freq_C( - VALUE _bin, - VALUE _freq_tot, - VALUE _kmer_size, - VALUE _freq_index - ) - { - unsigned int bin = NUM2UINT(_bin); - unsigned int freq_tot = NUM2UINT(_freq_tot); - unsigned int kmer_size = NUM2UINT(_kmer_size); - unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index); - - unsigned int mask = 3; - unsigned int i = 0; - unsigned int c = 0; - double p_freq = 1.0; - - for (i = 0; i < kmer_size; i++) - { - p_freq *= (double) freq_index[bin & mask] / freq_tot; - - bin >>= 2; - } - - return DBL2NUM(p_freq); - } - } - - builder.c %{ - VALUE count_C( - VALUE _bin, - VALUE _kmer_index - ) - { - unsigned int bin = NUM2UINT(_bin); - unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index); - - unsigned int count = kmer_index[bin]; - - return UINT2NUM(count); - } - } - end -end - -casts = [] -casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>true, :default=>10, :allowed=>nil, :disallowed=>"0"} -casts << {:long=>'min_ratio', :short=>'m', :type=>'float', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil} - -options = Biopieces.options_parse(ARGV, casts) - -kmer_stats = KmerStats.new(options[:size]) - -Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| - input.each_record do |record| - output.puts record - - if record[:SEQ] - kmer_stats.add(record[:SEQ]) - end - end - - kmer_stats.each do |bin| - p_kmer = kmer_stats.p_kmer(bin) - - next if p_kmer == 0.0 - - p_freq = kmer_stats.p_freq(bin) - - ratio = (p_kmer / p_freq).round(2) - - if ratio >= options[:min_ratio] - kmer = kmer_stats.kmer(bin) - count = kmer_stats.count(bin) - - record = { - :REC_TYPE => "KMER_STATS", - :P_KMER => p_kmer, - :KMER => kmer, - :COUNT => count, - :RATIO => ratio - } - - output.puts record - end - end -end - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -__END__ -- 2.39.5