1 # as published by the Free Software Foundation; either version 2
2 # of the License, or (at your option) any later version.
4 # This program is distributed in the hope that it will be useful,
5 # but WITHOUT ANY WARRANTY; without even the implied warranty of
6 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7 # GNU General Public License for more details.
9 # You should have received a copy of the GNU General Public License
10 # along with this program; if not, write to the Free Software
11 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
13 # http://www.gnu.org/copyleft/gpl.html
15 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
17 # This software is part of the Biopieces framework (www.biopieces.org).
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # SAM format version v1.4-r962 - April 17, 2011
23 # http://samtools.sourceforge.net/SAM1.pdf
26 require 'maasha/filesys'
29 # Error class for all exceptions to do with Genbank.
30 class SamError < StandardError; end
32 REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
33 REGEX_COMMENT = Regexp.new(/^@CO\t.*/)
35 # Class to parse and write SAM files.
39 # Method to initialize a Sam object.
40 def initialize(io = nil)
45 # Method to parse the header of a SAM file.
46 # Each header line should match:
47 # /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
48 # Tags containing lowercase letters are reserved for end users.
50 @io.each_line do |line|
51 if line =~ /^@([A-Za-z][A-Za-z])/
57 when 'HD' then parse_header(line)
58 when 'SQ' then parse_sequence(line)
59 when 'RG' then parse_read_group(line)
60 when 'PG' then parse_program(line)
61 when 'CO' then parse_comment(line)
63 raise SamError, "Unknown header tag: #{tag}"
71 return @header_hash.empty? ? nil : @header_hash
75 @io.each_line do |line|
77 entry = parse_alignment(line.chomp)
79 yield entry if block_given?
86 # Method to subparse header lines.
87 def parse_header(line)
89 fields = line.split("\t")
91 if fields[1] =~ /^VN:([0-9]+\.[0-9]+)$/
94 raise SamError, "Bad version number: #{fields[1]}"
98 if fields[2] =~ /^SO:(unknown|unsorted|queryname|coordinate)$/
101 raise SamError, "Bad sort order: #{fields[2]}"
105 @header_hash[:HD] = hash
108 # Method to subparse sequence lines.
109 def parse_sequence(line)
110 @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash
113 fields = line.split("\t")
115 if fields[1] =~ /^SN:([!-)+-<>-~][!-~]*)$/
118 raise SamError, "Bad sequence name: #{fields[1]}"
121 if fields[2] =~ /^LN:(\d+)$/
124 raise SamError, "Bad sequence length: #{fields[2]}"
127 (3 ... fields.size).each do |i|
128 if fields[i] =~ /^(AS|M5|SP|UR):([ -~]+)$/
131 raise SamError, "Bad sequence tag: #{fields[i]}"
135 @header_hash[:SQ][:SN] = Hash.new unless @header_hash[:SQ][:SN].is_a? Hash
137 if @header_hash[:SQ][:SN].has_key? seq_name
138 raise SamError, "Non-unique sequence name: #{seq_name}"
140 @header_hash[:SQ][:SN][seq_name] = hash
144 # Method to subparse read group lines.
145 def parse_read_group(line)
146 @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash
149 fields = line.split("\t")
151 if fields[1] =~ /^ID:([ -~]+)$/
154 raise SamError, "Bad read group identifier: #{fields[1]}"
157 (2 ... fields.size).each do |i|
158 if fields[i] =~ /^(CN|DS|DT|FO|KS|LB|PG|PI|PL|PU|SM):([ -~]+)$/
161 raise SamError, "Bad read group tag: #{fields[i]}"
166 unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/
167 raise SamError, "Bad flow order: #{hash[:FO]}"
172 unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/
173 raise SamError, "Bad platform: #{hash[:PL]}"
177 @header_hash[:RG][:ID] = Hash.new unless @header_hash[:RG][:ID].is_a? Hash
179 if @header_hash[:RG][:ID].has_key? id
180 raise SamError, "Non-unique read group identifier: #{id}"
182 @header_hash[:RG][:ID][id] = hash
186 # Method to subparse program lines.
187 def parse_program(line)
188 @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash
191 fields = line.split("\t")
193 if fields[1] =~ /^ID:([ -~]+)$/
196 raise SamError, "Bad program record identifier: #{fields[1]}"
199 (2 ... fields.size).each do |i|
200 if fields[i] =~ /^(PN|CL|PP|VN):([ -~]+)$/
203 raise SamError, "Bad program record tag: #{fields[i]}"
207 @header_hash[:PG][:ID] = Hash.new unless @header_hash[:PG][:ID].is_a? Hash
209 if @header_hash[:PG][:ID].has_key? id
210 raise SamError, "Non-unique program record identifier: #{id}"
212 @header_hash[:PG][:ID][id] = hash
216 # Method to subparse comment lines.
217 def parse_comment(line)
218 @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array
220 if line =~ /^@CO\t(.+)/
221 @header_hash[:CO] << $1
223 raise SamError, "Bad comment line: #{line}"
227 # Method to subparse alignment lines.
228 def parse_alignment(line)
229 fields = line.split("\t")
231 raise SamError, "Bad number of fields: #{fields.size} < 11" if fields.size < 11
234 flag = fields[1].to_i
237 mapq = fields[4].to_i
240 pnext = fields[7].to_i
241 tlen = fields[8].to_i
245 raise SamError, "Bad qname: #{qname}" unless qname =~ /^[!-?A-~]{1,255}$/
246 raise SamError, "Bad flag: #{flag}" unless (0 .. 2**16 - 1).include? flag
247 raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/
248 raise SamError, "Bad pos: #{pos}" unless (0 .. 2**29 - 1).include? pos
249 raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq
250 raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/
251 raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/
252 raise SamError, "Bad pnext: #{pnext}" unless (0 .. 2**29 - 1).include? pnext
253 raise SamError, "Bad tlen: #{tlen}" unless (-2**29 + 1 .. 2**29 - 1).include? tlen
254 raise SamError, "Bad seq: #{seq}" unless seq =~ /^(\*|[A-Za-z=.]+)$/
255 raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/
258 entry[:QNAME] = qname
260 entry[:RNAME] = rname
263 entry[:CIGAR] = cigar
264 entry[:RNEXT] = rnext
265 entry[:PNEXT] = pnext
275 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<