1 package Maasha::Solexa;
4 # Copyright (C) 2007-2008 Martin A. Hansen.
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
26 # Routines for manipulation Solid sequence files with di-base encoding.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
35 use vars qw( @ISA @EXPORT_OK );
39 @ISA = qw( Exporter );
47 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
50 sub solexa_get_entry_octal
52 # Martin A. Hansen, August 2008
54 # Get the next Solexa entry form a file and returns
55 # a triple of [ seq_name, seq, score ].
56 # We asume a Solexa entry consists of four lines:
58 # @HWI-EAS157_20FFGAAXX:2:1:888:434
59 # TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGG
60 # +HWI-EAS157_20FFGAAXX:2:1:888:434
61 # hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhKhe
63 my ( $fh, # filehandle to Solexa file
68 my ( $seq_header, $seq, $score_head, $score, @scores );
72 $score_header = <$fh>;
75 return if not defined $score;
82 @scores = split //, $score;
84 map { $_ = int( score_oct2dec( $_ ) ) } @scores;
86 $seq_header =~ s/^@//;
88 $score = join( ";", @scores );
90 return wantarray ? ( $seq_header, $seq, $score ) : [ $seq_header, $seq, $score ];
96 # Martin A. Hansen, August 2008
98 # Converts a Solexa score in octal format to decimal format.
100 # http://maq.sourceforge.net/fastq.shtml
102 my ( $char, # octal score
107 return 10 * log(1 + 10 ** ((ord( $char ) - 64) / 10.0)) / log(10);
111 sub solexa_get_entry_decimal
113 # Martin A. Hansen, August 2008
115 # Get the next Solexa entry form a file and returns
116 # a triple of [ seq_name, seq, score ].
117 # We asume a Solexa entry consists of four lines:
119 # @USI-EAS18_131_4_1_352_619
120 # ATGGATGGGTTGGAGATGCCCTCTGTAGGCACCAT
121 # +USI-EAS18_131_4_1_352_619
122 # 40 40 40 40 40 40 40 40 40 40 40 25 25 40 40 34 40 40 40 40 32 39 40 36 34 19 40 19 30 16 21 11 21 18 11
124 my ( $fh, # filehandle to Solexa file
129 my ( $seq_header, $seq, $score_head, $score, @scores );
133 $score_header = <$fh>;
136 return if not defined $score;
143 @scores = split / /, $score;
145 $seq_header =~ s/^@//;
147 $score = join( ";", @scores );
149 return wantarray ? ( $seq_header, $seq, $score ) : [ $seq_header, $seq, $score ];
155 # Martin A. Hansen, September 2008.
157 # Converts a Solexa entry to a Biopiece record.
159 my ( $entry, # Solexa entry
160 $quality, # Quality cutoff
165 my ( @seqs, @scores, $i, %record );
167 @seqs = split //, $entry->[ SEQ ];
168 @scores = split /;/, $entry->[ SCORE ];
170 for ( $i = 0; $i < @scores; $i++ ) {
171 $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $quality;
174 $record{ "SEQ_NAME" } = $entry->[ SEQ_NAME ];
175 $record{ "SEQ" } = $entry->[ SEQ ];
176 $record{ "SCORES" } = $entry->[ SCORE ];
177 $record{ "SEQ_LEN" } = scalar @seqs;
178 $record{ "SCORE_MEAN" } = sprintf ( "%.2f", Maasha::Calc::mean( \@scores ) );
180 return wantarray ? %record : \%record;
184 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<