X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=code_perl%2FMaasha%2FFastq.pm;h=ced2f0c6ca0035e9411863a0eb7626d69f6403cd;hb=9b9fbedec855fca622626c4bf479b40b8492172f;hp=cc895203672dc241ffe90773994508e4b73c178c;hpb=124abb0f4936b79a724556464ab8b8ae770bdb30;p=biopieces.git diff --git a/code_perl/Maasha/Fastq.pm b/code_perl/Maasha/Fastq.pm index cc89520..ced2f0c 100644 --- a/code_perl/Maasha/Fastq.pm +++ b/code_perl/Maasha/Fastq.pm @@ -46,15 +46,39 @@ use constant { use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "BP_TMP" } ); -int phred2score( char c ) +/* +About Phred or FASTQ format: + +This format is an adaption of the fasta format that contains quality scores. +However, the fastq format is not completely compatible with the fastq files +currently in existence, which is read by various applications (for example, +BioPerl). Because a larger dynamic range of quality scores is used, the +quality scores are encoded in ASCII as 64+score, instead of the standard +32+score. This method is used to avoid running into non-printable characters. + + +About Solexa or SCARF format: + +SCARF (Solexa compact ASCII read format)—This easy-to-parse text based format, +stores all the information for a single read in one line. Allowed values are +--numeric and --symbolic. --numeric outputs the quality values as a +space-separated string of numbers. --symbolic outputs the quality values as a +compact string of ASCII characters. Subtract 64 from the ASCII code to get the +corresponding quality values. Under the current numbering scheme, quality +values can theoretically be as low as -5, which has an ASCII encoding of 59=';'. + +See also: + +http://maq.sourceforge.net/fastq.shtml */ + + +int phred2dec( char c ) { /* Martin A. Hansen, July 2009 */ - /* Converts a FASTQ/phred score in octal (a char) to a decimal score, */ + /* Converts a Phred score in octal (a char) to a decimal score, */ /* which is returned. */ - /* http://maq.sourceforge.net/fastq.shtml */ - int score = 0; score = ( int ) c - 33; @@ -63,82 +87,210 @@ int phred2score( char c ) } -int solexa2score( char c ) +int solexa2dec( char c ) { /* Martin A. Hansen, July 2009 */ /* Converts a Solexa score in octal (a char) to a decimal score, */ /* which is returned. */ - /* http://maq.sourceforge.net/fastq.shtml */ - /* $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10); */ - int score = 0; - int ascii = ( int ) c - 64; - score = 10 * log( 1 + pow( 10, ascii / 10 ) ) / log( 10 ); + score = ( int ) c - 64; return score; } -char score2phred( int score ) +// int solexa2dec( char c ) +// { +// /* Martin A. Hansen, July 2009 */ +// +// /* Converts a Solexa score in octal (a char) to a decimal score, */ +// /* which is returned. */ +// +// /* http://maq.sourceforge.net/fastq.shtml */ +// /* $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10); */ +// +// int score = 0; +// int ascii = ( int ) c - 64; +// +// score = 10 * log( 1 + pow( 10, ascii / 10 ) ) / log( 10 ); +// +// // printf( "char: %c ascii: %d score: %d\n", c, ascii, score ); +// +// return score; +// } + + +char dec2phred( int score ) { /* Martin A. Hansen, July 2009 */ - /* Converts a decimal score in decimal to an octal score (a char), */ + /* Converts a decimal score to an octal score (a char) in the Phred range, */ /* which is returned. */ - /* http://maq.sourceforge.net/fastq.shtml */ + char c = 0; - char c; + c = ( char ) score + 33; + + return c; +} - if ( score <= 93 ) { - c = ( char ) score + 33; - } else { - c = ( char ) 93 + 33; - } -// printf( "score: %d char: %c\n", score, c ); +char dec2solexa( int score ) +{ + /* Martin A. Hansen, September 2009 */ + + /* Converts a decimal score to an octal score (a char) in the Solexa range, */ + /* which is returned. */ + + char c = 0; + + c = ( char ) score + 64; return c; } -void solexa2phred( char *scores ) +char solexa2phred( char score ) { - /* Martin A. Hansen, July 2009 */ + /* Martin A. Hansen, September 2009 */ - /* Convert Solexa scores to Phred scores for use with FASTQ format.*/ + /* Converts an octal score in the Solexa range to an octal score */ + /* in the Pred range, which is returned. */ - int i = 0; - int score = 0; - char c; + int dec = 0; + char c = 0; - for ( i = 0; i < strlen( scores ); i++ ) - { - score = solexa2score( scores[ i ] ); - c = score2phred( score ); + dec = solexa2dec( score ); + c = dec2phred( c ); + + return c; +} + + +char phred2solexa( char score ) +{ + /* Martin A. Hansen, September 2009 */ + + /* Converts an octal score in the Solexa range to an octal score */ + /* in the Pred range, which is returned. */ + + int dec = 0; + char c = 0; + + dec = phred2dec( score ); + c = dec2solexa( c ); + + return c; +} + + +void solexa_str2phred_str( char *scores ) +{ + /* Martin A. Hansen, September 2009 */ + + /* Converts a string of Solexa octal scores to Phred octal scores in-line. */ + + int i = 0; + + for ( i = 0; i < strlen( scores ); i++ ) { + scores[ i ] = solexa2phred( scores[ i ] ); + } +} + + +void phred_str2solexa_str( char *scores ) +{ + /* Martin A. Hansen, September 2009 */ + + /* Converts a string of Phred octal scores to Solexa octal scores in-line. */ + + int i = 0; + + for ( i = 0; i < strlen( scores ); i++ ) { + scores[ i ] = phred2solexa( scores[ i ] ); + } +} + + +double phred_str_mean( char *scores ) +{ + /* Martin A. Hansen, November 2009 */ + + /* Calculates the mean score as a float which is retuned. */ + + int len = 0; + int i = 0; + int sum = 0; + double mean = 0.0; + + len = strlen( scores ); -// printf( "scores[i]: %c score: %d char: %c\n", scores[ i ], score, c ); + for ( i = 0; i < len; i++ ) { + sum += phred2dec( scores[ i ] ); + } + + mean = ( double ) sum / ( double ) len; + + return mean; +} + + +double solexa_str_mean( char *scores ) +{ + /* Martin A. Hansen, November 2009 */ + + /* Calculates the mean score as a float which is retuned. */ - scores[ i ] = c; + int len = 0; + int i = 0; + int sum = 0; + double mean = 0.0; + + len = strlen( scores ); + + for ( i = 0; i < len; i++ ) { + sum += solexa2dec( scores[ i ] ); } + + mean = ( double ) sum / ( double ) len; + + return mean; } -void lowercase_low_scores( char *seq, char *scores, int threshold ) +void softmask_solexa_str( char *seq, char *scores, int threshold ) { /* Martin A. Hansen, July 2009 */ - /* Given a sequence string and a score string (in FASTQ/Phread range) */ + /* Given a sequence string and a score string (in Solexa range) */ /* lowercases all residues where the decimal score is below a given threshold. */ int i = 0; for ( i = 0; i < strlen( seq ); i++ ) { - if ( phred2score( scores[ i ] ) < threshold ) { + if ( solexa2dec( scores[ i ] ) < threshold ) { + seq[ i ] = tolower( seq[ i ] ); + } + } +} + + +void softmask_phred_str( char *seq, char *scores, int threshold ) +{ + /* Martin A. Hansen, July 2009 */ + + /* Given a sequence string and a score string (in Phred range) */ + /* lowercases all residues where the decimal score is below a given threshold. */ + + int i = 0; + + for ( i = 0; i < strlen( seq ); i++ ) + { + if ( phred2dec( scores[ i ] ) < threshold ) { seq[ i ] = tolower( seq[ i ] ); } } @@ -151,6 +303,60 @@ END_C # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +sub solexa_str2dec_str +{ + # Martin A. Hansen, September 2009 + + # Converts a string of Solexa octal scores to a ; separated + # string of decimal scores. + + my ( $scores, # Solexa scores + ) = @_; + + # Returns a string. + + $scores =~ s/(.)/ solexa2dec( $1 ) . ";"/eg; + + return $scores; +} + + +sub phred_str2dec_str +{ + # Martin A. Hansen, September 2009 + + # Converts a string of Phred octal scores to a ; separated + # string of decimal scores. + + my ( $scores, # Phred scores + ) = @_; + + # Returns a string. + + $scores =~ s/(.)/ phred2dec( $1 ) . ";"/eg; + + return $scores; +} + + +sub dec_str2solexa_str +{ + # Martin A. Hansen, May 2010. + + # Converts a ; separated string of decimal scores to a + # string of Solexa octal scores. + + my ( $scores, # Decimal score string + ) = @_; + + # Returns a string. + + $scores =~ s/(\d{1,2});?/dec2solexa( $1 )/eg; + + return $scores; +} + + sub get_entry { # Martin A. Hansen, July 2009.