From a761931fd09d82e1924dd0b3031fdcdf07f0d9a8 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 29 Jun 2012 18:27:21 +0000 Subject: [PATCH] rewrote mean_scores to fix bug git-svn-id: http://biopieces.googlecode.com/svn/trunk@1851 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/mean_scores | 137 ++++++++++++++++++++-------------- bp_test/in/mean_scores.in | 38 +++------- bp_test/out/mean_scores.out.1 | 48 +++--------- bp_test/out/mean_scores.out.2 | 55 +++----------- bp_test/out/mean_scores.out.3 | 55 +++----------- bp_test/test/test_mean_scores | 6 +- code_ruby/lib/maasha/seq.rb | 3 +- 7 files changed, 127 insertions(+), 215 deletions(-) diff --git a/bp_bin/mean_scores b/bp_bin/mean_scores index bd47e5b..adc80bc 100755 --- a/bp_bin/mean_scores +++ b/bp_bin/mean_scores @@ -1,6 +1,6 @@ -#!/usr/bin/env perl +#!/usr/bin/env ruby -# Copyright (C) 2007-2010 Martin A. Hansen. +# Copyright (C) 2007-2012 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -18,72 +18,93 @@ # http://www.gnu.org/copyleft/gpl.html - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -# Calculate the mean or local mean of quality SCORES in the stream. - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# This program is part of the Biopieces framework (www.biopieces.org). -use warnings; -use strict; -use Maasha::Biopieces; -use Maasha::Fastq; +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +# Calculate the mean or local mean of quality SCORES in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -my ( $options, $in, $out, $record, $mean, $pos ); - -$options = Maasha::Biopieces::parse_options( - [ - { long => 'local', short => 'l', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, - { long => 'window_size', short => 'w', type => 'uint', mandatory => 'no', default => 5, allowed => undef, disallowed => 0 }, - { long => 'min', short => 'm', type => 'float', mandatory => 'no', default => 15, allowed => undef, disallowed => undef }, - ] -); - -$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); -$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); - -while ( $record = Maasha::Biopieces::get_record( $in ) ) -{ - if ( $record->{ 'SCORES' } ) - { - if ( $options->{ "local" } ) +require 'maasha/biopieces' +require 'maasha/seq' +require 'inline' +require 'pp' + + +# Opening class Seq to add scores_mean_local(_C) methods. +class Seq + # Method to run a sliding window of a specified size across a Phred type + # scores string and calculate for each window the mean score and return + # the minimum mean score. + def scores_mean_local(window_size) + scores_mean_local_C(self.qual, self.length, SCORE_BASE, window_size) + end + + inline do |builder| + builder.c %{ + VALUE scores_mean_local_C( + VALUE _qual, + VALUE _qual_len, + VALUE _score_base, + VALUE _window_size + ) + { + unsigned char *qual = (unsigned char *) StringValuePtr(_qual); + unsigned int qual_len = FIX2UINT(_qual_len); + unsigned int score_base = FIX2UINT(_score_base); + unsigned int window_size = FIX2UINT(_window_size); + unsigned int sum = 0; + unsigned int i = 0; + float mean = 0.0; + + // fill window + for (i = 0; i < window_size; i++) + sum += qual[i] - score_base; + + mean = sum / window_size; + + // run window across the rest of the scores + while (i < qual_len) { - ( $mean, $pos ) = Maasha::Fastq::solexa_str_mean_window( $record->{ 'SCORES' }, $options->{ 'window_size' }, $options->{ 'min' } ); - $record->{ 'SCORES_LOCAL_POS' } = $pos; - $record->{ 'SCORES_LOCAL_MEAN' } = sprintf( "%.2f", $mean); - } - else - { - $record->{ 'SCORES_MEAN' } = sprintf( "%.2f", Maasha::Fastq::solexa_str_mean( $record->{ 'SCORES' } ) ); - } - } - - Maasha::Biopieces::put_record( $record, $out ); -} - -Maasha::Biopieces::close_stream( $in ); -Maasha::Biopieces::close_stream( $out ); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -BEGIN -{ - Maasha::Biopieces::status_set(); -} + sum += qual[i] - score_base; + sum -= qual[i - window_size] - score_base; + mean = ( mean < sum / window_size ) ? mean : sum / window_size; + + i++; + } -END -{ - Maasha::Biopieces::status_log(); -} + return rb_float_new(mean); + } + } + end +end + +casts = [] +casts << {:long=>'local', :short=>'l', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'window_size', :short=>'w', :type=>'uint', :mandatory=>false, :default=>5, :allowed=>nil, :disallowed=>'0'} +casts << {:long=>'min', :short=>'m', :type=>'float', :mandatory=>false, :default=>15, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + if record[:SEQ] and record[:SCORES] + entry = Seq.new_bp(record) + + if options[:local] + record[:SCORES_MEAN_LOCAL] = entry.scores_mean_local(options[:window_size]).round(2) + else + record[:SCORES_MEAN] = entry.scores_mean.round(2) + end + end + + output.puts record + end +end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/bp_test/in/mean_scores.in b/bp_test/in/mean_scores.in index 62229f8..b68bba0 100644 --- a/bp_test/in/mean_scores.in +++ b/bp_test/in/mean_scores.in @@ -1,35 +1,15 @@ SEQ_NAME: test1 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: hhhhhhhhhhhhhhhhhhhh --- SEQ_NAME: test2 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SCORES: hhhhBBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: @@@@@hhhhhhhhhhhhhhh --- SEQ_NAME: test3 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SCORES: hhhhBBBBBBBBBBhhhhhhhhhhhhhhhhhhhhhhhhh ---- -SEQ_NAME: test4 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhBBBBB ---- -SEQ_NAME: test5 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhBBBBBBBBBB ---- -SEQ_NAME: test6 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SCORES: BBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh ---- -SEQ_NAME: test7 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SCORES: BBBBBBBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: hhhhhhhhhhhhhhh@@@@@ --- diff --git a/bp_test/out/mean_scores.out.1 b/bp_test/out/mean_scores.out.1 index 85013fb..affde14 100644 --- a/bp_test/out/mean_scores.out.1 +++ b/bp_test/out/mean_scores.out.1 @@ -1,42 +1,18 @@ -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 SEQ_NAME: test1 -SCORES_MEAN: 40.00 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: hhhhhhhhhhhhhhhhhhhh +SCORES_MEAN: 40.0 --- -SCORES: hhhhBBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 SEQ_NAME: test2 -SCORES_MEAN: 35.13 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: @@@@@hhhhhhhhhhhhhhh +SCORES_MEAN: 30.0 --- -SCORES: hhhhBBBBBBBBBBhhhhhhhhhhhhhhhhhhhhhhhhh -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 SEQ_NAME: test3 -SCORES_MEAN: 30.26 ---- -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhBBBBB -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SEQ_NAME: test4 -SCORES_MEAN: 35.13 ---- -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhBBBBBBBBBB -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SEQ_NAME: test5 -SCORES_MEAN: 30.26 ---- -SCORES: BBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SEQ_NAME: test6 -SCORES_MEAN: 35.13 ---- -SCORES: BBBBBBBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SEQ_LEN: 39 -SEQ_NAME: test7 -SCORES_MEAN: 30.26 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: hhhhhhhhhhhhhhh@@@@@ +SCORES_MEAN: 30.0 --- diff --git a/bp_test/out/mean_scores.out.2 b/bp_test/out/mean_scores.out.2 index daace59..bd5ab60 100644 --- a/bp_test/out/mean_scores.out.2 +++ b/bp_test/out/mean_scores.out.2 @@ -1,49 +1,18 @@ -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 40.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: -1 -SEQ_LEN: 39 SEQ_NAME: test1 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: hhhhhhhhhhhhhhhhhhhh +SCORES_MEAN_LOCAL: 40.0 --- -SCORES: hhhhBBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 9.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 3 -SEQ_LEN: 39 SEQ_NAME: test2 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: @@@@@hhhhhhhhhhhhhhh +SCORES_MEAN_LOCAL: 0.0 --- -SCORES: hhhhBBBBBBBBBBhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 9.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 3 -SEQ_LEN: 39 SEQ_NAME: test3 ---- -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhBBBBB -SCORES_LOCAL_MEAN: 9.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 33 -SEQ_LEN: 39 -SEQ_NAME: test4 ---- -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhBBBBBBBBBB -SCORES_LOCAL_MEAN: 9.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 28 -SEQ_LEN: 39 -SEQ_NAME: test5 ---- -SCORES: BBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 2.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 0 -SEQ_LEN: 39 -SEQ_NAME: test6 ---- -SCORES: BBBBBBBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 2.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 0 -SEQ_LEN: 39 -SEQ_NAME: test7 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: hhhhhhhhhhhhhhh@@@@@ +SCORES_MEAN_LOCAL: 0.0 --- diff --git a/bp_test/out/mean_scores.out.3 b/bp_test/out/mean_scores.out.3 index 7ecb05b..57fb50b 100644 --- a/bp_test/out/mean_scores.out.3 +++ b/bp_test/out/mean_scores.out.3 @@ -1,49 +1,18 @@ -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 40.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: -1 -SEQ_LEN: 39 SEQ_NAME: test1 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: hhhhhhhhhhhhhhhhhhhh +SCORES_MEAN_LOCAL: 40.0 --- -SCORES: hhhhBBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 2.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 4 -SEQ_LEN: 39 SEQ_NAME: test2 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: @@@@@hhhhhhhhhhhhhhh +SCORES_MEAN_LOCAL: 20.0 --- -SCORES: hhhhBBBBBBBBBBhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 2.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 4 -SEQ_LEN: 39 SEQ_NAME: test3 ---- -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhBBBBB -SCORES_LOCAL_MEAN: 2.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 34 -SEQ_LEN: 39 -SEQ_NAME: test4 ---- -SCORES: hhhhhhhhhhhhhhhhhhhhhhhhhhhhhBBBBBBBBBB -SCORES_LOCAL_MEAN: 2.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 29 -SEQ_LEN: 39 -SEQ_NAME: test5 ---- -SCORES: BBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 2.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 0 -SEQ_LEN: 39 -SEQ_NAME: test6 ---- -SCORES: BBBBBBBBBBhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -SCORES_LOCAL_MEAN: 2.00 -SEQ: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -SCORES_LOCAL_POS: 0 -SEQ_LEN: 39 -SEQ_NAME: test7 +SEQ: AAAAAAAAAAAAAAAAAAAA +SEQ_LEN: 20 +SCORES: hhhhhhhhhhhhhhh@@@@@ +SCORES_MEAN_LOCAL: 20.0 --- diff --git a/bp_test/test/test_mean_scores b/bp_test/test/test_mean_scores index 6ff413f..ee24d3e 100755 --- a/bp_test/test/test_mean_scores +++ b/bp_test/test/test_mean_scores @@ -10,10 +10,6 @@ run "$bp -I $in -l -O $tmp" assert_no_diff $tmp $out.2 clean -run "$bp -I $in -l -m 2 -O $tmp" +run "$bp -I $in -l -w 10 -O $tmp" assert_no_diff $tmp $out.3 clean - -run "$bp -I $in -l -m 2 -w 10 -O $tmp" -assert_no_diff $tmp $out.4 -clean diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index aac8d5e..55c9c1f 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -505,7 +505,8 @@ class Seq raise SeqError, "Missing qual in entry" if self.qual.nil? na_qual = NArray.to_na(self.qual, "byte") - (na_qual - SCORE_BASE).mean + na_qual -= SCORE_BASE + na_qual.mean end end -- 2.39.5