1 # Copyright (C) 2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'maasha/base36'
27 # Error class for all exceptions to do with SFF.
28 class SFFError < StandardError; end
31 BIT_MASK = (1 << BIT_SHIFT) - 1
33 # Class containing methods to parse SFF files:
34 # http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format#sff
38 # Class method for opening SFF files.
40 ios = File.open(*args)
53 # Method to initialize an SFF object along with
54 # instance variables pertaining to the SFF header
65 @number_of_flows_per_read = 0
66 @flowgram_format_code = 0
69 @eight_byte_padding = 0
75 # Method to close ios.
80 # Method to iterate over each SFF entry.
82 while (read = read_parse) do
91 # Method to parse the SFF file's header section
92 # and load the information into the instance variables.
94 template = "NC4N2NNnnnC"
97 data = @io.read(31).unpack(template)
99 @magic_number = data[0]
100 @version = data[1 .. 4].join ""
101 @index_offset = (data[5] << bits_in_uint) | data[6]
102 @index_length = data[7]
103 @number_of_reads = data[8]
104 @header_length = data[9]
105 @key_length = data[10]
106 @number_of_flows_per_read = data[11]
107 @flowgram_format_code = data[12]
108 @flow_chars = @io.read(@number_of_flows_per_read).unpack("A*").join ""
109 @key_sequence = @io.read(@key_length).unpack("A*").join ""
118 # Method that reads the eight_byte_padding field found at the end of the
119 # data section and fast forwards, i.e. move the file read pointer,
120 # so that the length of the section is divisible by 8.
122 eight_byte_padding = 8 - (@io.pos % 8)
124 @io.read(eight_byte_padding) unless eight_byte_padding == 8
127 # Method to parse a read section of an SFF file.
129 return nil if @number_of_reads == @count
135 data = @io.read(16).unpack(template)
137 read.read_header_length = data[0]
138 read.name_length = data[1]
139 read.number_of_bases = data[2]
140 read.clip_qual_left = data[3]
141 read.clip_qual_right = data[4]
142 read.clip_adapter_left = data[5]
143 read.clip_adaptor_right = data[6]
144 read.name = @io.read(read.name_length).unpack("A*").join ""
148 @io.read(2 * @number_of_flows_per_read) # skip through flowgram_values
149 @io.read(read.number_of_bases) # skip through flow_index_per_base
151 # NB! Parsing of flowgram_values and flow_index_per_base is currently disabled since these are not needed.
152 # read.flowgram_values = @io.read(2 * @number_of_flows_per_read).unpack("n*").map { |val| val = sprintf("%.2f", val * 0.01) }
153 # flow_index_per_base = @io.read(read.number_of_bases).unpack("C*")
154 # (1 ... flow_index_per_base.length).each { |i| flow_index_per_base[i] += flow_index_per_base[i - 1] }
155 # read.flow_index_per_base = flow_index_per_base
157 read.bases = @io.read(read.number_of_bases).unpack("A*").join ""
158 read.quality_scores = @io.read(read.number_of_bases).unpack("C*")
167 # Method to check the magic number of an SFF file.
168 # Raises an error if the magic number don't match.
169 def check_magic_number
170 raise SFFError, "Badly formatted SFF file." unless @magic_number == 779314790
173 # Method to check the version number of an SFF file.
174 # Raises an error if the version don't match.
176 raise SFFError, "Wrong version: #{@version}" unless @version.to_i == 1
179 # Method to check the header length of an SFF file.
180 # Raises an error if the header length don't match
181 # the file position after reading the header section.
182 def check_header_length
183 raise SFFError, "Bad header length: #{header_length}" unless @io.pos == @header_length
187 # Class containing data accessor methods for an SFF entry and methods
188 # for manipulating this entry.
190 attr_accessor :read_header_length, :name_length, :number_of_bases,
191 :clip_qual_left, :clip_qual_right, :clip_adapter_left, :clip_adaptor_right,
192 :name, :flowgram_values, :flow_index_per_base, :bases, :quality_scores,
195 # Method that converts a Read object's data to a Biopiece record (a hash).
201 hash[:SEQ_NAME] = self.name
202 hash[:SEQ] = self.bases
203 hash[:SEQ_LEN] = self.bases.length
204 hash[:CLIP_QUAL_LEFT] = self.clip_qual_left - 1
205 hash[:CLIP_QUAL_RIGHT] = self.clip_qual_right - 1
206 hash[:CLIP_ADAPTOR_LEFT] = self.clip_adapter_left - 1
207 hash[:CLIP_ADAPTOR_RIGHT] = self.clip_adaptor_right - 1
208 hash[:SCORES] = self.quality_scores.map { |i| (i += 64).chr }.join ""
209 hash[:X_POS] = self.x_pos
210 hash[:Y_POS] = self.y_pos
215 # Method that soft masks the sequence (i.e. lowercases sequence) according to
216 # clip_qual_left and clip_qual_right information.
218 left = self.bases[0 ... self.clip_qual_left - 1].downcase
219 middle = self.bases[self.clip_qual_left - 1 ... self.clip_qual_right]
220 right = self.bases[self.clip_qual_right ... self.bases.length].downcase
222 self.bases = left + middle + right
225 # Method that clips sequence (i.e. trims) according to
226 # clip_qual_left and clip_qual_right information.
228 self.bases = self.bases[self.clip_qual_left - 1 ... self.clip_qual_right]
229 self.quality_scores = self.quality_scores[self.clip_qual_left - 1 ... self.clip_qual_right]
234 # Method that extracts the X/Y coordinates from
235 # an SFF read name encoded with base36.
237 base36 = self.name[-5, 5]
238 num = Base36.decode(base36)
240 self.x_pos = num >> BIT_SHIFT
241 self.y_pos = num & BIT_MASK