From 43a3f6b695375536014eb9626dc1b18abbbe83bb Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 4 Sep 2009 14:32:01 +0000 Subject: [PATCH] disabled quality score conversion in FASTQ and Solexa git-svn-id: http://biopieces.googlecode.com/svn/trunk@650 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/bowtie_seq | 2 - bp_bin/read_fastq | 8 -- bp_bin/read_solexa | 10 --- bp_bin/read_tab | 5 +- code_perl/Maasha/Fastq.pm | 176 ++++++++++++++++++++++++++++++-------- code_perl/Maasha/Plot.pm | 10 +-- 6 files changed, 149 insertions(+), 62 deletions(-) diff --git a/bp_bin/bowtie_seq b/bp_bin/bowtie_seq index cffcd32..2b9254d 100755 --- a/bp_bin/bowtie_seq +++ b/bp_bin/bowtie_seq @@ -123,7 +123,6 @@ while ( $line = <$fh_in> ) chomp $line; @fields = split /\t/, $line; - $record = bowtie2biopiece( \@fields ); Maasha::Biopieces::put_record( $record, $out ); @@ -161,7 +160,6 @@ sub bowtie2biopiece $record->{ 'SCORES' } = $entry->[ 5 ]; $record->{ 'SCORE' } = $entry->[ 6 ] + 1; $record->{ 'DESCRIPTOR' } = $entry->[ 7 ]; - $record->{ 'SEQ_LEN' } = length $entry->[ 4 ]; $record->{ 'S_END' } = $record->{ 'S_BEG' } + $record->{ 'SEQ_LEN' } - 1; $record->{ 'SCORES' } =~ s/(.)/ord( $1 ) - 33 . ";"/ge; # http://maq.sourceforge.net/fastq.shtml diff --git a/bp_bin/read_fastq b/bp_bin/read_fastq index a5f82bc..262bfde 100755 --- a/bp_bin/read_fastq +++ b/bp_bin/read_fastq @@ -64,14 +64,6 @@ if ( $options->{ 'data_in' } ) { if ( $record = Maasha::Fastq::fastq2biopiece( $entry ) ) { - if ( not $options->{ 'skip_quality' } ) - { - Maasha::Fastq::lowercase_low_scores( $record->{ 'SEQ' }, $record->{ 'SCORES' }, $options->{ 'quality' } ); - - $record->{ 'SCORES' } =~ s/(.)/ord( $1 ) - 33 . ";"/ge; # http://maq.sourceforge.net/fastq.shtml - $record->{ 'SCORE_MEAN' } = sprintf( "%.2f", Maasha::Calc::mean( [ split /;/, $record->{ 'SCORES' } ] ) ); - } - Maasha::Biopieces::put_record( $record, $out ); } diff --git a/bp_bin/read_solexa b/bp_bin/read_solexa index c1b4f0b..0d93142 100755 --- a/bp_bin/read_solexa +++ b/bp_bin/read_solexa @@ -67,16 +67,6 @@ if ( $options->{ 'data_in' } ) { if ( $record = Maasha::Fastq::fastq2biopiece( $entry ) ) { - if ( not $options->{ 'skip_quality' } ) - { - $record->{ 'SCORES' } =~ s/(\d+) ?/Maasha::Fastq::score2phred( $1 )/ge if $options->{ 'format' } eq 'decimal'; - Maasha::Fastq::solexa2phred( $record->{ 'SCORES' } ); - Maasha::Fastq::lowercase_low_scores( $record->{ 'SEQ' }, $record->{ 'SCORES' }, $options->{ 'quality' } ); - - $record->{ 'SCORES' } =~ s/(.)/ord( $1 ) - 33 . ";"/ge; # http://maq.sourceforge.net/fastq.shtml - $record->{ 'SCORE_MEAN' } = sprintf( "%.2f", Maasha::Calc::mean( [ split /;/, $record->{ 'SCORES' } ] ) ); - } - Maasha::Biopieces::put_record( $record, $out ); } diff --git a/bp_bin/read_tab b/bp_bin/read_tab index f223523..a8dede9 100755 --- a/bp_bin/read_tab +++ b/bp_bin/read_tab @@ -28,6 +28,7 @@ use warnings; use strict; +use Data::Dumper; use Maasha::Filesys; use Maasha::Biopieces; @@ -64,6 +65,8 @@ if ( $options->{ 'data_in' } ) while ( $line = <$data_in> ) { + chomp $line; + if ( $skip ) { $skip--; @@ -81,8 +84,6 @@ if ( $options->{ 'data_in' } ) next; } - chomp $line; - undef $record; undef @fields2; diff --git a/code_perl/Maasha/Fastq.pm b/code_perl/Maasha/Fastq.pm index cc89520..3a8ed43 100644 --- a/code_perl/Maasha/Fastq.pm +++ b/code_perl/Maasha/Fastq.pm @@ -46,15 +46,39 @@ use constant { use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "BP_TMP" } ); -int phred2score( char c ) +/* +About Phred or FASTQ format: + +This format is an adaption of the fasta format that contains quality scores. +However, the fastq format is not completely compatible with the fastq files +currently in existence, which is read by various applications (for example, +BioPerl). Because a larger dynamic range of quality scores is used, the +quality scores are encoded in ASCII as 64+score, instead of the standard +32+score. This method is used to avoid running into non-printable characters. + + +About Solexa or SCARF format: + +SCARF (Solexa compact ASCII read format)—This easy-to-parse text based format, +stores all the information for a single read in one line. Allowed values are +--numeric and --symbolic. --numeric outputs the quality values as a +space-separated string of numbers. --symbolic outputs the quality values as a +compact string of ASCII characters. Subtract 64 from the ASCII code to get the +corresponding quality values. Under the current numbering scheme, quality +values can theoretically be as low as -5, which has an ASCII encoding of 59=';'. + +See also: + +http://maq.sourceforge.net/fastq.shtml */ + + +int phred2dec( char c ) { /* Martin A. Hansen, July 2009 */ - /* Converts a FASTQ/phred score in octal (a char) to a decimal score, */ + /* Converts a Phred score in octal (a char) to a decimal score, */ /* which is returned. */ - /* http://maq.sourceforge.net/fastq.shtml */ - int score = 0; score = ( int ) c - 33; @@ -63,82 +87,164 @@ int phred2score( char c ) } -int solexa2score( char c ) +int solexa2dec( char c ) { /* Martin A. Hansen, July 2009 */ /* Converts a Solexa score in octal (a char) to a decimal score, */ /* which is returned. */ - /* http://maq.sourceforge.net/fastq.shtml */ - /* $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10); */ - int score = 0; - int ascii = ( int ) c - 64; - score = 10 * log( 1 + pow( 10, ascii / 10 ) ) / log( 10 ); + score = ( int ) c - 64; return score; } -char score2phred( int score ) +// int solexa2dec( char c ) +// { +// /* Martin A. Hansen, July 2009 */ +// +// /* Converts a Solexa score in octal (a char) to a decimal score, */ +// /* which is returned. */ +// +// /* http://maq.sourceforge.net/fastq.shtml */ +// /* $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10); */ +// +// int score = 0; +// int ascii = ( int ) c - 64; +// +// score = 10 * log( 1 + pow( 10, ascii / 10 ) ) / log( 10 ); +// +// // printf( "char: %c ascii: %d score: %d\n", c, ascii, score ); +// +// return score; +// } + + +char dec2phred( int score ) { /* Martin A. Hansen, July 2009 */ - /* Converts a decimal score in decimal to an octal score (a char), */ + /* Converts a decimal score to an octal score (a char) in the Phred range, */ /* which is returned. */ - /* http://maq.sourceforge.net/fastq.shtml */ + char c = 0; - char c; + c = ( char ) score + 33; + + return c; +} - if ( score <= 93 ) { - c = ( char ) score + 33; - } else { - c = ( char ) 93 + 33; - } -// printf( "score: %d char: %c\n", score, c ); +char dec2solexa( int score ) +{ + /* Martin A. Hansen, September 2009 */ + + /* Converts a decimal score to an octal score (a char) in the Solexa range, */ + /* which is returned. */ + + char c = 0; + + c = ( char ) score + 64; return c; } -void solexa2phred( char *scores ) +char solexa2phred( char score ) { - /* Martin A. Hansen, July 2009 */ + /* Martin A. Hansen, September 2009 */ - /* Convert Solexa scores to Phred scores for use with FASTQ format.*/ + /* Converts an octal score in the Solexa range to an octal score */ + /* in the Pred range, which is returned. */ - int i = 0; - int score = 0; - char c; + int dec = 0; + char c = 0; + + dec = solexa2dec( score ); + c = dec2phred( c ); + + return c; +} + + +char phred2solexa( char score ) +{ + /* Martin A. Hansen, September 2009 */ + + /* Converts an octal score in the Solexa range to an octal score */ + /* in the Pred range, which is returned. */ + + int dec = 0; + char c = 0; + + dec = phred2dec( score ); + c = dec2solexa( c ); + + return c; +} + + +void solexa_str2phred_str( char *scores ) +{ + /* Martin A. Hansen, September 2009 */ + + /* Converts a string of Solexa octal scores to Phred octal scores in-line. */ + + int i = 0; + + for ( i + 0; i < strlen( scores ); i++ ) { + scores[ i ] = solexa2phred( scores[ i ] ); + } +} - for ( i = 0; i < strlen( scores ); i++ ) - { - score = solexa2score( scores[ i ] ); - c = score2phred( score ); -// printf( "scores[i]: %c score: %d char: %c\n", scores[ i ], score, c ); +void phred_str2solexa_str( char *scores ) +{ + /* Martin A. Hansen, September 2009 */ + + /* Converts a string of Phred octal scores to Solexa octal scores in-line. */ - scores[ i ] = c; + int i = 0; + + for ( i + 0; i < strlen( scores ); i++ ) { + scores[ i ] = phred2solexa( scores[ i ] ); + } +} + + +void softmask_solexa_str( char *seq, char *scores, int threshold ) +{ + /* Martin A. Hansen, July 2009 */ + + /* Given a sequence string and a score string (in Solexa range) */ + /* lowercases all residues where the decimal score is below a given threshold. */ + + int i = 0; + + for ( i = 0; i < strlen( seq ); i++ ) + { + if ( solexa2dec( scores[ i ] ) < threshold ) { + seq[ i ] = tolower( seq[ i ] ); + } } } -void lowercase_low_scores( char *seq, char *scores, int threshold ) +void softmask_phred_str( char *seq, char *scores, int threshold ) { /* Martin A. Hansen, July 2009 */ - /* Given a sequence string and a score string (in FASTQ/Phread range) */ + /* Given a sequence string and a score string (in Phred range) */ /* lowercases all residues where the decimal score is below a given threshold. */ int i = 0; for ( i = 0; i < strlen( seq ); i++ ) { - if ( phred2score( scores[ i ] ) < threshold ) { + if ( phred2dec( scores[ i ] ) < threshold ) { seq[ i ] = tolower( seq[ i ] ); } } diff --git a/code_perl/Maasha/Plot.pm b/code_perl/Maasha/Plot.pm index 720b5e9..4981b62 100644 --- a/code_perl/Maasha/Plot.pm +++ b/code_perl/Maasha/Plot.pm @@ -66,7 +66,7 @@ sub lineplot_simple # Returns list. - my ( $tmp_file, $pid, $fh_in, $fh_out, $cmd, $i, $line, @lines, $xtic_space, @plot_cmd ); + my ( $tmp_file, $pid, $fh_in, $fh_out, $cmd, $i, $line, @lines, $xtic_space, @plot_cmd, $title ); $tmp_dir ||= $ENV{ 'BP_TMP' }; @@ -93,11 +93,12 @@ sub lineplot_simple print $fh_in "set logscale y\n" if $options->{ "logscale_y" }; print $fh_in "set grid\n" if not $options->{ "terminal" } eq "dumb"; print $fh_in "set autoscale\n"; - print $fh_in "unset key\n"; print $fh_in "set xtics border in scale 0 nomirror rotate by 90 offset character 0, 0, 0\n"; - for ( $i = 1; $i < scalar @{ $data->[ 0 ] } + 1; $i++ ) { - push @plot_cmd, qq("$tmp_file" using $i with lines ls 1); + for ( $i = 1; $i < scalar @{ $data->[ 0 ] } + 1; $i++ ) + { + $title = $options->{ 'keys' }->[ $i - 1 ] || $options->{ 'list' }; + push @plot_cmd, qq("$tmp_file" using $i with lines title "$title"); } print $fh_in "plot " . join( ", ", @plot_cmd ) . "\n"; @@ -158,7 +159,6 @@ sub histogram_simple print $fh_in "set style histogram title offset character 0, 0, 0\n"; print $fh_in "set style data histograms\n"; print $fh_in "set xtics border in scale 0 nomirror rotate by 90 offset character 0, 0, 0\n"; - print $fh_in "plot '-' using 2:xticlabels(1)\n"; for ( $i = 0; $i < @{ $data }; $i++ ) -- 2.39.2