From 966e4cd7386b6c3389da706198d0e070cf8a93f5 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 29 Jun 2010 12:34:24 +0000 Subject: [PATCH] added biopieces mean_scores git-svn-id: http://biopieces.googlecode.com/svn/trunk@1002 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/mean_scores | 94 +++++++++++++++++++++++++++++++++++++++ code_perl/Maasha/Fastq.pm | 86 ++++++++++++++++++++++++----------- 2 files changed, 155 insertions(+), 25 deletions(-) create mode 100755 bp_bin/mean_scores diff --git a/bp_bin/mean_scores b/bp_bin/mean_scores new file mode 100755 index 0000000..ab98dec --- /dev/null +++ b/bp_bin/mean_scores @@ -0,0 +1,94 @@ +#!/usr/bin/env perl + +# Copyright (C) 2007-2010 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Calculate the mean or local mean of quality SCORES in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use warnings; +use strict; +use Maasha::Biopieces; +use Maasha::Fastq; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $options, $in, $out, $record ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'local', short => 'l', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'window_size', short => 'w', type => 'uint', mandatory => 'no', default => 5, allowed => undef, disallowed => 0 }, + { long => 'min', short => 'm', type => 'float', mandatory => 'no', default => 15, allowed => undef, disallowed => undef }, + ] +); + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( $record->{ 'SCORES' } ) + { + if ( $options->{ "local" } ) + { + $record->{ 'SCORES_LOCAL_MEAN' } = sprintf( "%.2f", Maasha::Fastq::solexa_str_mean_window( + $record->{ 'SCORES' }, + $options->{ 'window_size' }, + $options->{ 'min' } + ) ); + } + else + { + $record->{ 'SCORES_MEAN' } = sprintf( "%.2f", Maasha::Fastq::solexa_str_mean( $record->{ 'SCORES' } ) ); + } + } + + Maasha::Biopieces::put_record( $record, $out ); +} + +Maasha::Biopieces::close_stream( $in ); +Maasha::Biopieces::close_stream( $out ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + Maasha::Biopieces::status_set(); +} + + +END +{ + Maasha::Biopieces::status_log(); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/code_perl/Maasha/Fastq.pm b/code_perl/Maasha/Fastq.pm index ced2f0c..b27017b 100644 --- a/code_perl/Maasha/Fastq.pm +++ b/code_perl/Maasha/Fastq.pm @@ -71,6 +71,9 @@ See also: http://maq.sourceforge.net/fastq.shtml */ +#define BASE_SOLEXA 64 +#define BASE_PHRED 33 + int phred2dec( char c ) { @@ -81,7 +84,7 @@ int phred2dec( char c ) int score = 0; - score = ( int ) c - 33; + score = ( int ) c - BASE_PHRED; return score; } @@ -96,33 +99,12 @@ int solexa2dec( char c ) int score = 0; - score = ( int ) c - 64; + score = ( int ) c - BASE_SOLEXA; return score; } -// int solexa2dec( char c ) -// { -// /* Martin A. Hansen, July 2009 */ -// -// /* Converts a Solexa score in octal (a char) to a decimal score, */ -// /* which is returned. */ -// -// /* http://maq.sourceforge.net/fastq.shtml */ -// /* $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10); */ -// -// int score = 0; -// int ascii = ( int ) c - 64; -// -// score = 10 * log( 1 + pow( 10, ascii / 10 ) ) / log( 10 ); -// -// // printf( "char: %c ascii: %d score: %d\n", c, ascii, score ); -// -// return score; -// } - - char dec2phred( int score ) { /* Martin A. Hansen, July 2009 */ @@ -132,7 +114,7 @@ char dec2phred( int score ) char c = 0; - c = ( char ) score + 33; + c = ( char ) score + BASE_PHRED; return c; } @@ -147,7 +129,7 @@ char dec2solexa( int score ) char c = 0; - c = ( char ) score + 64; + c = ( char ) score + BASE_SOLEXA; return c; } @@ -261,6 +243,60 @@ double solexa_str_mean( char *scores ) } +double solexa_str_mean_window( char *scores, int window_size, double min ) +{ + /* Martin A. Hansen, June 2010. */ + + /* Scans a score string by running a sliding window across */ + /* the string and for each position calculate the mean score */ + /* for the window. Terminates and returns mean score if this */ + /* is lower than a given minimum otherwise the smallest mean */ + /* score is returned. */ + + int i = 0; + double sum = 0; + double mean = 0.0; + + if ( window_size > strlen( scores ) ) + { + fprintf( stderr, "ERROR: window_size > scores string: %d > %d\n\n", window_size, strlen(scores) ); + exit( 1 ); + } + + /* ---- fill up window ---- */ + + for ( i = 0; i < window_size; i++ ) { + sum += solexa2dec( scores[ i ] ); + } + + mean = sum / window_size; + + if ( mean < min ) { + return mean; + } + + /* --- scan the rest of the scores ---- */ + + while ( i < strlen( scores ) ) + { + sum += solexa2dec( scores[ i ] ); + sum -= solexa2dec( scores[ i - window_size ] ); + + mean = ( mean < sum / window_size ) ? mean : sum / window_size; + + // printf( "char->%c score->%d sum->%f mean->%f\n", scores[i], solexa2dec(scores[i]),sum, mean); + + if ( mean < min ) { + return mean; + } + + i++; + } + + return mean; +} + + void softmask_solexa_str( char *seq, char *scores, int threshold ) { /* Martin A. Hansen, July 2009 */ -- 2.39.2