3 # Copyright (C) 2006-2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for manipulation of FASTQ files and FASTQ entries.
27 # http://maq.sourceforge.net/fastq.shtml
30 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
37 use vars qw( @ISA @EXPORT );
45 @ISA = qw( Exporter );
47 use Inline ( C => <<'END_C', DIRECTORY => $ENV{ "BP_TMP" } );
50 About Phred or FASTQ format:
52 This format is an adaption of the fasta format that contains quality scores.
53 However, the fastq format is not completely compatible with the fastq files
54 currently in existence, which is read by various applications (for example,
55 BioPerl). Because a larger dynamic range of quality scores is used, the
56 quality scores are encoded in ASCII as 64+score, instead of the standard
57 32+score. This method is used to avoid running into non-printable characters.
60 About Solexa or SCARF format:
62 SCARF (Solexa compact ASCII read format)—This easy-to-parse text based format,
63 stores all the information for a single read in one line. Allowed values are
64 --numeric and --symbolic. --numeric outputs the quality values as a
65 space-separated string of numbers. --symbolic outputs the quality values as a
66 compact string of ASCII characters. Subtract 64 from the ASCII code to get the
67 corresponding quality values. Under the current numbering scheme, quality
68 values can theoretically be as low as -5, which has an ASCII encoding of 59=';'.
72 http://maq.sourceforge.net/fastq.shtml */
74 #define BASE_SOLEXA 64
78 int phred2dec( char c )
80 /* Martin A. Hansen, July 2009 */
82 /* Converts a Phred score in octal (a char) to a decimal score, */
83 /* which is returned. */
87 score = ( int ) c - BASE_PHRED;
93 int solexa2dec( char c )
95 /* Martin A. Hansen, July 2009 */
97 /* Converts a Solexa score in octal (a char) to a decimal score, */
98 /* which is returned. */
102 score = ( int ) c - BASE_SOLEXA;
108 char dec2phred( int score )
110 /* Martin A. Hansen, July 2009 */
112 /* Converts a decimal score to an octal score (a char) in the Phred range, */
113 /* which is returned. */
117 c = ( char ) score + BASE_PHRED;
123 char dec2solexa( int score )
125 /* Martin A. Hansen, September 2009 */
127 /* Converts a decimal score to an octal score (a char) in the Solexa range, */
128 /* which is returned. */
132 c = ( char ) score + BASE_SOLEXA;
138 char solexa2phred( char score )
140 /* Martin A. Hansen, September 2009 */
142 /* Converts an octal score in the Solexa range to an octal score */
143 /* in the Pred range, which is returned. */
148 dec = solexa2dec( score );
155 char phred2solexa( char score )
157 /* Martin A. Hansen, September 2009 */
159 /* Converts an octal score in the Solexa range to an octal score */
160 /* in the Pred range, which is returned. */
165 dec = phred2dec( score );
172 void solexa_str2phred_str( char *scores )
174 /* Martin A. Hansen, September 2009 */
176 /* Converts a string of Solexa octal scores to Phred octal scores in-line. */
180 for ( i = 0; i < strlen( scores ); i++ ) {
181 scores[ i ] = solexa2phred( scores[ i ] );
186 void phred_str2solexa_str( char *scores )
188 /* Martin A. Hansen, September 2009 */
190 /* Converts a string of Phred octal scores to Solexa octal scores in-line. */
194 for ( i = 0; i < strlen( scores ); i++ ) {
195 scores[ i ] = phred2solexa( scores[ i ] );
200 double phred_str_mean( char *scores )
202 /* Martin A. Hansen, November 2009 */
204 /* Calculates the mean score as a float which is retuned. */
211 len = strlen( scores );
213 for ( i = 0; i < len; i++ ) {
214 sum += phred2dec( scores[ i ] );
217 mean = ( double ) sum / ( double ) len;
223 double solexa_str_mean( char *scores )
225 /* Martin A. Hansen, November 2009 */
227 /* Calculates the mean score as a float which is retuned. */
234 len = strlen( scores );
236 for ( i = 0; i < len; i++ ) {
237 sum += solexa2dec( scores[ i ] );
240 mean = ( double ) sum / ( double ) len;
246 void solexa_str_mean_window( char *scores, int window_size, double min )
248 /* Martin A. Hansen, June 2010. */
250 /* Scans a score string by running a sliding window across */
251 /* the string and for each position calculate the mean score */
252 /* for the window. Terminates and returns mean score if this */
253 /* is lower than a given minimum otherwise the smallest mean */
254 /* score is returned. */
262 if ( window_size > strlen( scores ) )
264 fprintf( stderr, "ERROR: window_size > scores string: %d > %d\n\n", window_size, strlen(scores) );
268 /* ---- fill up window ---- */
270 for ( i = 0; i < window_size; i++ ) {
271 sum += solexa2dec( scores[ i ] );
274 mean = sum / window_size;
281 /* --- scan the rest of the scores ---- */
283 while ( ! found && i < strlen( scores ) )
285 sum += solexa2dec( scores[ i ] );
286 sum -= solexa2dec( scores[ i - window_size ] );
288 mean = ( mean < sum / window_size ) ? mean : sum / window_size;
290 // printf( "char->%c score->%d sum->%f mean->%f\n", scores[i], solexa2dec(scores[i]),sum, mean);
296 pos = i - window_size;
303 Inline_Stack_Push( sv_2mortal( newSViv( mean ) ) );
304 Inline_Stack_Push( sv_2mortal( newSViv( pos ) ) );
310 void softmask_solexa_str( char *seq, char *scores, int threshold )
312 /* Martin A. Hansen, July 2009 */
314 /* Given a sequence string and a score string (in Solexa range) */
315 /* lowercases all residues where the decimal score is below a given threshold. */
319 for ( i = 0; i < strlen( seq ); i++ )
321 if ( solexa2dec( scores[ i ] ) < threshold ) {
322 seq[ i ] = tolower( seq[ i ] );
328 void softmask_phred_str( char *seq, char *scores, int threshold )
330 /* Martin A. Hansen, July 2009 */
332 /* Given a sequence string and a score string (in Phred range) */
333 /* lowercases all residues where the decimal score is below a given threshold. */
337 for ( i = 0; i < strlen( seq ); i++ )
339 if ( phred2dec( scores[ i ] ) < threshold ) {
340 seq[ i ] = tolower( seq[ i ] );
346 int trim_left( char *scores, int min )
348 /* Martin A. Hansen, June 2010 */
350 /* Starting from the left in a score string, */
351 /* locate the position when the score is above */
356 while ( pos < strlen( scores ) && solexa2dec( scores[ pos ] ) <= min ) {
364 int trim_right( char *scores, int min )
366 /* Martin A. Hansen, June 2010 */
368 /* Starting from the right in a score string, */
369 /* locate the position when the score is above */
372 int len = strlen( scores );
375 while ( pos > 0 && solexa2dec( scores[ pos ] ) <= min ) {
389 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
392 sub solexa_str2dec_str
394 # Martin A. Hansen, September 2009
396 # Converts a string of Solexa octal scores to a ; separated
397 # string of decimal scores.
399 my ( $scores, # Solexa scores
404 $scores =~ s/(.)/ solexa2dec( $1 ) . ";"/eg;
410 sub phred_str2dec_str
412 # Martin A. Hansen, September 2009
414 # Converts a string of Phred octal scores to a ; separated
415 # string of decimal scores.
417 my ( $scores, # Phred scores
422 $scores =~ s/(.)/ phred2dec( $1 ) . ";"/eg;
428 sub dec_str2solexa_str
430 # Martin A. Hansen, May 2010.
432 # Converts a ; separated string of decimal scores to a
433 # string of Solexa octal scores.
435 my ( $scores, # Decimal score string
440 $scores =~ s/(-\d{1,2})/0/g;
441 $scores =~ s/(\d{1,2});?/dec2solexa( $1 )/eg;
449 # Martin A. Hansen, July 2009.
451 # Gets the next FASTQ entry from a given filehandle.
453 my ( $fh, # filehandle
458 my ( $seq, $seq_name, $qual, $qual_name );
474 return wantarray ? ( $seq_name, $seq, $qual ) : [ $seq_name, $seq, $qual ];
480 # Martin A. Hansen, July 2009.
482 # Output a FASTQ entry to STDOUT or a filehandle.
484 my ( $entry, # FASTQ entry
485 $fh, # filehandle - OPTIONAL
492 print $fh "@" . $entry->[ SEQ_NAME ] . "\n";
493 print $fh $entry->[ SEQ ] . "\n";
495 print $fh $entry->[ SCORES ] . "\n";
501 # Martin A. Hansen, July 2009.
503 # Converts a FASTQ entry to a Biopiece record, where
504 # the FASTQ quality scores are converted to numerics.
506 my ( $entry, # FASTQ entry,
513 $record->{ 'SEQ' } = $entry->[ SEQ ];
514 $record->{ 'SEQ_NAME' } = $entry->[ SEQ_NAME ];
515 $record->{ 'SCORES' } = $entry->[ SCORES ];
516 $record->{ 'SEQ_LEN' } = length $entry->[ SEQ ];
518 return wantarray ? %{ $record } : $record;
524 # Martin A. Hansen, July 2009.
526 # Converts a Biopiece record to a FASTQ entry.
528 my ( $record, # Biopiece record
535 if ( exists $record->{ 'SEQ' } and exists $record->{ 'SEQ_NAME' } and exists $record->{ 'SCORES' } )
537 $list->[ SEQ_NAME ] = $record->{ 'SEQ_NAME' };
538 $list->[ SEQ ] = $record->{ 'SEQ' };
539 $list->[ SCORES ] = $record->{ 'SCORES' };
541 $list->[ SCORES ] =~ s/(\d+);/chr( ( $1 <= 93 ? $1 : 93 ) + 33 )/ge;
543 return wantarray ? @{ $list } : $list;
550 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<