3 # Copyright (C) 2007-2012 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
21 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
23 # This program is part of the Biopieces framework (www.biopieces.org).
25 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
27 # Replace values to a specied key for each record in the stream.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
31 require 'maasha/biopieces'
35 # Class containing methods to locate over-represented kmers in sequences.
40 # Method to initialize a KmerStats object.
41 def initialize(kmer_size)
42 @kmer_size = kmer_size
44 raise ArgumentError, "kmer_size: #{kmer_size} > 16" if kmer_size > 16
48 @kmer_index_size = NUC_ALPH_SIZE ** kmer_size
49 @freq_index_size = NUC_ALPH_SIZE
50 @kmer_index = "\0" * @kmer_index_size * BYTES_IN_INT
51 @freq_index = "\0" * @freq_index_size * BYTES_IN_INT
54 # Method to add a sequence to the KmerStats.
56 @kmer_tot += add_C(seq, seq.length, @kmer_size, @kmer_index, @freq_index)
57 @freq_tot += seq.length
60 # Method for iterating the KmerStats and return.
61 # each kmer as a binary value.
63 (0 ... @kmer_index_size).each do |bin|
68 # Method to determine the probability of an observed binary kmer.
70 p_kmer_C(bin, @kmer_tot, @kmer_index)
73 # Method to determine the probablity of a theoretical binary kmer.
75 p_freq_C(bin, @freq_tot, @kmer_size, @freq_index)
78 # Method that converts a binary kmer to DNA.
83 (0 ... @kmer_size).each do
85 when 1 then dna << "T"
86 when 2 then dna << "C"
87 when 3 then dna << "G"
98 # Method that returns the count of a given binary kmer.
100 count_C(bin, @kmer_index)
107 unsigned char nuc2bin[256] = {
108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
109 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
136 unsigned char *seq = (unsigned char *) StringValuePtr(_seq);
137 unsigned int seq_len = NUM2UINT(_seq_len);
138 unsigned int kmer_size = NUM2UINT(_kmer_size);
139 unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
140 unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index);
142 unsigned int mask = (1 << (2 * kmer_size)) - 1;
143 unsigned int bin = 0;
144 unsigned int val = 0;
147 for (i = 0; i < kmer_size; i++)
150 val = nuc2bin[seq[i]];
160 val = nuc2bin[seq[i]];
163 kmer_index[bin & mask]++;
169 return UINT2NUM(i - kmer_size + 1);
180 unsigned char *seq = (unsigned char *) StringValuePtr(_seq);
181 unsigned int seq_len = NUM2UINT(_seq_len);
182 unsigned int *index = (unsigned int *) StringValuePtr(_index);
186 for (i = 0; i < seq_len; i++)
202 unsigned int bin = NUM2UINT(_bin);
203 unsigned int kmer_tot = NUM2UINT(_kmer_tot);
204 unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
208 p_kmer = (double) kmer_index[bin] / kmer_tot;
210 return DBL2NUM(p_kmer);
222 unsigned int bin = NUM2UINT(_bin);
223 unsigned int freq_tot = NUM2UINT(_freq_tot);
224 unsigned int kmer_size = NUM2UINT(_kmer_size);
225 unsigned int *freq_index = (unsigned int *) StringValuePtr(_freq_index);
227 unsigned int mask = 3;
232 for (i = 0; i < kmer_size; i++)
234 p_freq *= (double) freq_index[bin & mask] / freq_tot;
239 return DBL2NUM(p_freq);
249 unsigned int bin = NUM2UINT(_bin);
250 unsigned int *kmer_index = (unsigned int *) StringValuePtr(_kmer_index);
252 unsigned int count = kmer_index[bin];
254 return UINT2NUM(count);
261 casts << {:long=>'size', :short=>'s', :type=>'uint', :mandatory=>true, :default=>10, :allowed=>nil, :disallowed=>"0"}
262 casts << {:long=>'min_ratio', :short=>'m', :type=>'float', :mandatory=>false, :default=>10, :allowed=>nil, :disallowed=>nil}
264 options = Biopieces.options_parse(ARGV, casts)
266 kmer_stats = KmerStats.new(options[:size])
268 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
269 input.each_record do |record|
273 kmer_stats.add(record[:SEQ])
277 kmer_stats.each do |bin|
278 p_kmer = kmer_stats.p_kmer(bin)
280 next if p_kmer == 0.0
282 p_freq = kmer_stats.p_freq(bin)
284 ratio = (p_kmer / p_freq).round(2)
286 if ratio >= options[:min_ratio]
287 kmer = kmer_stats.kmer(bin)
288 count = kmer_stats.count(bin)
291 :REC_TYPE => "KMER_STATS",
303 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<