1 # Copyright (C) 2007-2011 Martin A. Hansen.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 # http://www.gnu.org/copyleft/gpl.html
19 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
21 # This software is part of the Biopieces framework (www.biopieces.org).
23 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # SAM format version v1.4-r962 - April 17, 2011
27 # http://samtools.sourceforge.net/SAM1.pdf
30 require 'maasha/filesys'
33 # Error class for all exceptions to do with Genbank.
34 class SamError < StandardError; end
36 REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
37 REGEX_COMMENT = Regexp.new(/^@CO\t.*/)
39 # Class to parse and write SAM files.
41 attr_accessor :io, :header
43 # Method to initialize a Sam object.
44 def initialize(io = nil)
52 @io.each_line do |line|
54 entry = parse_alignment(line.chomp)
56 yield entry if block_given?
63 # Method to parse the header section of a SAM file.
64 # Each header line should match:
65 # /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
66 # Tags containing lowercase letters are reserved for end users.
68 @io.each_line do |line|
69 if line =~ /^@([A-Za-z][A-Za-z])/
75 when 'HD' then subparse_header(line)
76 when 'SQ' then subparse_sequence(line)
77 when 'RG' then subparse_read_group(line)
78 when 'PG' then subparse_program(line)
79 when 'CO' then subparse_comment(line)
81 raise SamError, "Unknown header tag: #{tag}"
89 return @header.empty? ? nil : @header
92 # Method to subparse header lines.
93 def subparse_header(line)
95 fields = line.split("\t")
97 if fields[1] =~ /^VN:([0-9]+\.[0-9]+)$/
100 raise SamError, "Bad version number: #{fields[1]}"
104 if fields[2] =~ /^SO:(unknown|unsorted|queryname|coordinate)$/
107 raise SamError, "Bad sort order: #{fields[2]}"
114 # Method to subparse sequence lines.
115 def subparse_sequence(line)
116 @header[:SQ] = Hash.new unless @header[:SQ].is_a? Hash
119 fields = line.split("\t")
121 if fields[1] =~ /^SN:([!-)+-<>-~][!-~]*)$/
124 raise SamError, "Bad sequence name: #{fields[1]}"
127 if fields[2] =~ /^LN:(\d+)$/
130 raise SamError, "Bad sequence length: #{fields[2]}"
133 (3 ... fields.size).each do |i|
134 if fields[i] =~ /^(AS|M5|SP|UR):([ -~]+)$/
137 raise SamError, "Bad sequence tag: #{fields[i]}"
141 @header[:SQ][:SN] = Hash.new unless @header[:SQ][:SN].is_a? Hash
143 if @header[:SQ][:SN].has_key? seq_name
144 raise SamError, "Non-unique sequence name: #{seq_name}"
146 @header[:SQ][:SN][seq_name] = hash
150 # Method to subparse read group lines.
151 def subparse_read_group(line)
152 @header[:RG] = Hash.new unless @header[:RG].is_a? Hash
155 fields = line.split("\t")
157 if fields[1] =~ /^ID:([ -~]+)$/
160 raise SamError, "Bad read group identifier: #{fields[1]}"
163 (2 ... fields.size).each do |i|
164 if fields[i] =~ /^(CN|DS|DT|FO|KS|LB|PG|PI|PL|PU|SM):([ -~]+)$/
167 raise SamError, "Bad read group tag: #{fields[i]}"
172 unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/
173 raise SamError, "Bad flow order: #{hash[:FO]}"
178 unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/
179 raise SamError, "Bad platform: #{hash[:PL]}"
183 @header[:RG][:ID] = Hash.new unless @header[:RG][:ID].is_a? Hash
185 if @header[:RG][:ID].has_key? id
186 raise SamError, "Non-unique read group identifier: #{id}"
188 @header[:RG][:ID][id] = hash
192 # Method to subparse program lines.
193 def subparse_program(line)
194 @header[:PG] = Hash.new unless @header[:PG].is_a? Hash
197 fields = line.split("\t")
199 if fields[1] =~ /^ID:([ -~]+)$/
202 raise SamError, "Bad program record identifier: #{fields[1]}"
205 (2 ... fields.size).each do |i|
206 if fields[i] =~ /^(PN|CL|PP|VN):([ -~]+)$/
209 raise SamError, "Bad program record tag: #{fields[i]}"
213 @header[:PG][:ID] = Hash.new unless @header[:PG][:ID].is_a? Hash
215 if @header[:PG][:ID].has_key? id
216 raise SamError, "Non-unique program record identifier: #{id}"
218 @header[:PG][:ID][id] = hash
222 # Method to subparse comment lines.
223 def subparse_comment(line)
224 @header[:CO] = Array.new unless @header[:CO].is_a? Array
226 if line =~ /^@CO\t(.+)/
229 raise SamError, "Bad comment line: #{line}"
233 # Method to subparse alignment lines.
234 def parse_alignment(line)
235 fields = line.split("\t")
237 raise SamError, "Bad number of fields: #{fields.size} < 11" if fields.size < 11
240 flag = fields[1].to_i
243 mapq = fields[4].to_i
246 pnext = fields[7].to_i
247 tlen = fields[8].to_i
251 raise SamError, "Bad qname: #{qname}" unless qname =~ /^[!-?A-~]{1,255}$/
252 raise SamError, "Bad flag: #{flag}" unless (0 .. 2**16 - 1).include? flag
253 raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/
254 raise SamError, "Bad pos: #{pos}" unless (0 .. 2**29 - 1).include? pos
255 raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq
256 raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/
257 raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/
258 raise SamError, "Bad pnext: #{pnext}" unless (0 .. 2**29 - 1).include? pnext
259 raise SamError, "Bad tlen: #{tlen}" unless (-2**29 + 1 .. 2**29 - 1).include? tlen
260 raise SamError, "Bad seq: #{seq}" unless seq =~ /^(\*|[A-Za-z=.]+)$/
261 raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/
266 entry[:QNAME] = qname
268 entry[:RNAME] = rname
271 entry[:CIGAR] = cigar
272 entry[:RNEXT] = rnext
273 entry[:PNEXT] = pnext
281 # Method to check if rname, when not '*' and
282 # @SQ header lines are present, is located in
284 def check_rname(rname)
285 unless @header.empty? or rname == '*'
286 unless @header[:SQ][:SN].has_key? rname.to_sym
287 raise SamError, "rname not found in header hash: #{rname}"
294 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<