1 # Copyright (C) 2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 require 'maasha/base36'
28 # Error class for all exceptions to do with SFF.
29 class SFFError < StandardError; end
32 BIT_MASK = (1 << BIT_SHIFT) - 1
34 # Class containing methods to parse SFF files:
35 # http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format#sff
39 # Class method for opening SFF files.
41 ios = File.open(*args)
54 # Method to initialize an SFF object along with
55 # instance variables pertaining to the SFF header
66 @number_of_flows_per_read = 0
67 @flowgram_format_code = 0
70 @eight_byte_padding = 0
76 # Method to close ios.
81 # Method to iterate over each SFF entry.
83 while (read = read_parse) do
92 # Method to parse the SFF file's header section
93 # and load the information into the instance variables.
95 template = "NC4N2NNnnnC"
98 data = @io.read(31).unpack(template)
100 @magic_number = data[0]
101 @version = data[1 .. 4].join ""
102 @index_offset = (data[5] << bits_in_uint) | data[6]
103 @index_length = data[7]
104 @number_of_reads = data[8]
105 @header_length = data[9]
106 @key_length = data[10]
107 @number_of_flows_per_read = data[11]
108 @flowgram_format_code = data[12]
109 @flow_chars = @io.read(@number_of_flows_per_read).unpack("A*").join ""
110 @key_sequence = @io.read(@key_length).unpack("A*").join ""
119 # Method that reads the eight_byte_padding field found at the end of the
120 # data section and fast forwards, i.e. move the file read pointer,
121 # so that the length of the section is divisible by 8.
123 eight_byte_padding = 8 - (@io.pos % 8)
125 @io.read(eight_byte_padding) unless eight_byte_padding == 8
128 # Method to parse a read section of an SFF file.
130 return nil if @number_of_reads == @count
136 data = @io.read(16).unpack(template)
138 read.read_header_length = data[0]
139 read.name_length = data[1]
140 read.number_of_bases = data[2]
141 read.clip_qual_left = data[3]
142 read.clip_qual_right = data[4]
143 read.clip_adapter_left = data[5]
144 read.clip_adaptor_right = data[6]
145 read.name = @io.read(read.name_length).unpack("A*").join ""
149 @io.read(2 * @number_of_flows_per_read) # skip through flowgram_values
150 @io.read(read.number_of_bases) # skip through flow_index_per_base
152 # NB! Parsing of flowgram_values and flow_index_per_base is currently disabled since these are not needed.
153 # read.flowgram_values = @io.read(2 * @number_of_flows_per_read).unpack("n*").map { |val| val = sprintf("%.2f", val * 0.01) }
154 # flow_index_per_base = @io.read(read.number_of_bases).unpack("C*")
155 # (1 ... flow_index_per_base.length).each { |i| flow_index_per_base[i] += flow_index_per_base[i - 1] }
156 # read.flow_index_per_base = flow_index_per_base
158 read.bases = @io.read(read.number_of_bases).unpack("A*").join ""
159 read.quality_scores = @io.read(read.number_of_bases).unpack("C*")
168 # Method to check the magic number of an SFF file.
169 # Raises an error if the magic number don't match.
170 def check_magic_number
171 raise SFFError, "Badly formatted SFF file." unless @magic_number == 779314790
174 # Method to check the version number of an SFF file.
175 # Raises an error if the version don't match.
177 raise SFFError, "Wrong version: #{@version}" unless @version.to_i == 1
180 # Method to check the header length of an SFF file.
181 # Raises an error if the header length don't match
182 # the file position after reading the header section.
183 def check_header_length
184 raise SFFError, "Bad header length: #{header_length}" unless @io.pos == @header_length
188 # Class containing data accessor methods for an SFF entry and methods
189 # for manipulating this entry.
191 attr_accessor :read_header_length, :name_length, :number_of_bases,
192 :clip_qual_left, :clip_qual_right, :clip_adapter_left, :clip_adaptor_right,
193 :name, :flowgram_values, :flow_index_per_base, :bases, :quality_scores,
196 # Method that converts a Read object's data to a Biopiece record (a hash).
198 # Simulated SFF files may not have coordinates.
206 hash[:SEQ_NAME] = self.name
207 hash[:SEQ] = self.bases
208 hash[:SEQ_LEN] = self.bases.length
209 hash[:CLIP_QUAL_LEFT] = self.clip_qual_left - 1
210 hash[:CLIP_QUAL_RIGHT] = self.clip_qual_right - 1
211 hash[:CLIP_ADAPTOR_LEFT] = self.clip_adapter_left - 1
212 hash[:CLIP_ADAPTOR_RIGHT] = self.clip_adaptor_right - 1
213 hash[:SCORES] = self.quality_scores.map { |i| (i += Seq::SCORE_BASE).chr }.join ""
214 hash[:X_POS] = self.x_pos
215 hash[:Y_POS] = self.y_pos
220 # Method that converts a Read object's data to a Seq object.
225 qual: self.quality_scores.map { |i| (i += Seq::SCORE_BASE).chr }.join("")
229 # Method that soft masks the sequence (i.e. lowercases sequence) according to
230 # clip_qual_left and clip_qual_right information.
232 left = self.bases[0 ... self.clip_qual_left - 1].downcase
233 middle = self.bases[self.clip_qual_left - 1 ... self.clip_qual_right]
234 right = self.bases[self.clip_qual_right ... self.bases.length].downcase
236 self.bases = left + middle + right
241 # Method that clips sequence (i.e. trims) according to
242 # clip_qual_left and clip_qual_right information.
244 self.bases = self.bases[self.clip_qual_left - 1 ... self.clip_qual_right]
245 self.quality_scores = self.quality_scores[self.clip_qual_left - 1 ... self.clip_qual_right]
252 # Method that extracts the X/Y coordinates from
253 # an SFF read name encoded with base36.
255 base36 = self.name[-5, 5]
256 num = Base36.decode(base36)
258 self.x_pos = num >> BIT_SHIFT
259 self.y_pos = num & BIT_MASK