From: martinahansen Date: Tue, 28 Jul 2009 14:00:24 +0000 (+0000) Subject: changed read_solexa X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=07ad2954f22907a77b58118273f6915c73e9dfa7;p=biopieces.git changed read_solexa git-svn-id: http://biopieces.googlecode.com/svn/trunk@605 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/read_fastq b/bp_bin/read_fastq index 8aeedf9..088f8d6 100755 --- a/bp_bin/read_fastq +++ b/bp_bin/read_fastq @@ -63,7 +63,10 @@ if ( $options->{ 'data_in' } ) { if ( $record = Maasha::Fastq::fastq2biopiece( $entry ) ) { - lowercase_low_scores( $record, $options->{ 'quality' } ); + Maasha::Fastq::lowercase_low_scores( $record->{ 'SEQ' }, $record->{ 'SCORES' }, $options->{ 'quality' } ); + + $record->{ 'SCORES' } =~ s/(.)/ord( $1 ) - 33 . ";"/ge; # http://maq.sourceforge.net/fastq.shtml + $record->{ 'SCORE_MEAN' } = sprintf( "%.2f", Maasha::Calc::mean( [ split /;/, $record->{ 'SCORES' } ] ) ); Maasha::Biopieces::put_record( $record, $out ); } @@ -83,37 +86,6 @@ Maasha::Biopieces::close_stream( $out ); # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -sub lowercase_low_scores -{ - # Martin A. Hansen, July 2009. - - # Lowercase residues in sequence that are below a given score threshold. - - # This one would be lovely to have written in C. - - my ( $record, # Biopiece record - $quality, # quality threshold - ) = @_; - - # Returns nothing. - - my ( @seqs, @scores, $i ); - - @seqs = split //, $record->{ 'SEQ' }; - @scores = split /;/, $record->{ 'SCORES' }; - - for ( $i = 0; $i < @scores; $i++ ) { - $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $quality; - } - - $record->{ 'SEQ' } = join "", @seqs; - $record->{ 'SCORES' } = join ";", @scores; -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - BEGIN { Maasha::Biopieces::status_set(); diff --git a/bp_bin/read_solexa b/bp_bin/read_solexa index dc5d237..9bb251a 100755 --- a/bp_bin/read_solexa +++ b/bp_bin/read_solexa @@ -28,9 +28,11 @@ use warnings; use strict; +use Data::Dumper; use Maasha::Biopieces; use Maasha::Filesys; use Maasha::Solexa; +use Maasha::Fastq; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -60,31 +62,23 @@ if ( $options->{ 'data_in' } ) $num = 1; - if ( $options->{ "format" } eq "octal" ) + while ( $entry = Maasha::Fastq::get_entry( $data_in ) ) { - while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) ) + if ( $record = Maasha::Fastq::fastq2biopiece( $entry ) ) { - $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); + $record->{ 'SCORES' } =~ s/(\d+) ?/Maasha::Fastq::score2phred( $1 )/ge if $options->{ 'format' } eq 'decimal'; + Maasha::Fastq::solexa2phred( $record->{ 'SCORES' } ); + Maasha::Fastq::lowercase_low_scores( $record->{ 'SEQ' }, $record->{ 'SCORES' }, $options->{ 'quality' } ); - Maasha::Biopieces::put_record( $record, $out ); - - last if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - else - { - while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) ) - { - $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); + $record->{ 'SCORES' } =~ s/(.)/ord( $1 ) - 33 . ";"/ge; # http://maq.sourceforge.net/fastq.shtml + $record->{ 'SCORE_MEAN' } = sprintf( "%.2f", Maasha::Calc::mean( [ split /;/, $record->{ 'SCORES' } ] ) ); Maasha::Biopieces::put_record( $record, $out ); - - last if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; } + + last if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; } close $data_in; diff --git a/code_perl/Maasha/Fastq.pm b/code_perl/Maasha/Fastq.pm index be3c87c..4782486 100644 --- a/code_perl/Maasha/Fastq.pm +++ b/code_perl/Maasha/Fastq.pm @@ -36,14 +36,117 @@ use Data::Dumper; use Maasha::Calc; use vars qw( @ISA @EXPORT ); -@ISA = qw( Exporter ); - use constant { SEQ_NAME => 0, SEQ => 1, SCORES => 2, }; +@ISA = qw( Exporter ); + +use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "BP_TMP" } ); + +int phred2score( char c ) +{ + /* Martin A. Hansen, July 2009 */ + + /* Converts a FASTQ/phred score in octal (a char) to a decimal score, */ + /* which is returned. */ + + /* http://maq.sourceforge.net/fastq.shtml */ + + int score = 0; + + score = ( int ) c - 33; + + return score; +} + + +int solexa2score( char c ) +{ + /* Martin A. Hansen, July 2009 */ + + /* Converts a Solexa score in octal (a char) to a decimal score, */ + /* which is returned. */ + + /* http://maq.sourceforge.net/fastq.shtml */ + /* $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10); */ + + int score = 0; + int ascii = ( int ) c - 64; + + score = 10 * log( 1 + pow( 10, ascii / 10 ) ) / log( 10 ); + + return score; +} + + +char score2phred( int score ) +{ + /* Martin A. Hansen, July 2009 */ + + /* Converts a decimal score in decimal to an octal score (a char), */ + /* which is returned. */ + + /* http://maq.sourceforge.net/fastq.shtml */ + + char c; + + if ( score <= 93 ) { + c = ( char ) score + 33; + } else { + c = ( char ) 93 + 33; + } + +// printf( "score: %d char: %c\n", score, c ); + + return c; +} + + +void lowercase_low_scores( char *seq, char *scores, int threshold ) +{ + /* Martin A. Hansen, July 2009 */ + + /* Given a sequence string and a score string (in FASTQ/Phread range) */ + /* lowercases all residues where the decimal score is below a given threshold. */ + + int i = 0; + + for ( i = 0; i < strlen( seq ); i++ ) + { + if ( phred2score( scores[ i ] ) < threshold ) { + seq[ i ] = tolower( seq[ i ] ); + } + } +} + + +void solexa2phred( char *scores ) +{ + /* Martin A. Hansen, July 2009 */ + + /* Convert Solexa scores to Phred scores for use with FASTQ format.*/ + + int i = 0; + int score = 0; + char c; + + for ( i = 0; i < strlen( scores ); i++ ) + { + score = solexa2score( scores[ i ] ); + c = score2phred( score ); + +// printf( "scores[i]: %c score: %d char: %c\n", scores[ i ], score, c ); + + scores[ i ] = c; + } +} + + +END_C + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -118,9 +221,6 @@ sub fastq2biopiece $record->{ 'SEQ_NAME' } = $entry->[ SEQ_NAME ]; $record->{ 'SCORES' } = $entry->[ SCORES ]; - $record->{ 'SCORES' } =~ s/(.)/ord( $1 ) - 33 . ";"/ge; # http://maq.sourceforge.net/fastq.shtml - $record->{ 'SCORE_MEAN' } = sprintf( "%.2f", Maasha::Calc::mean( [ split /;/, $record->{ 'SCORES' } ] ) ); - return wantarray ? %{ $record } : $record; } diff --git a/code_perl/Maasha/Solexa.pm b/code_perl/Maasha/Solexa.pm index a26b0ce..fa2764e 100644 --- a/code_perl/Maasha/Solexa.pm +++ b/code_perl/Maasha/Solexa.pm @@ -121,7 +121,7 @@ sub solexa_get_entry_decimal # Returns a list. - my ( $seq_header, $seq, $score_head, $score, @scores ); + my ( $seq_header, $seq, $score_head, $score ); $seq_header = <$fh>; $seq = <$fh>; @@ -135,11 +135,8 @@ sub solexa_get_entry_decimal chomp $score_header; chomp $score; - @scores = split / /, $score; - $seq_header =~ s/^@//; - - $score = join( ";", @scores ); + $score =~ s/ /;/g; return wantarray ? ( $seq_header, $seq, $score ) : [ $seq_header, $seq, $score ]; }