# Pattern for a regular SAM header line: "@" and a two-letter record type,
# followed by one or more TAB-separated TAG:VALUE fields.
REGEX_HEADER = /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/
# Pattern for a SAM comment line: "@CO", a TAB, then arbitrary text.
REGEX_COMMENT = /^@CO\t.*/
+# Class to parse and write SAM files.
class Sam < Filesys
attr_accessor :io
+ # Method to initialize a Sam object.
def initialize(io = nil)
  # Parsed header records are collected in this hash; it starts out empty.
  @header_hash = {}
  # Stream whose lines are read by the parsing methods (nil if none given).
  @io = io
end
- # Each header line should match: /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
+ # Method to parse the header of a SAM file.
+ # Each header line should match:
+ # /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
# Tags containing lowercase letters are reserved for end users.
def header
@io.each_line do |line|
- line.chomp!
-
if line =~ /^@([A-Za-z][A-Za-z])/
+ line.chomp!
+
tag = $1
case tag
return @header_hash.empty? ? nil : @header_hash
end
# Method to iterate over the alignment lines read from @io.
# Lines starting with '@' (header lines) are skipped. Every other line is
# chomped, parsed with parse_alignment, and the resulting entry is yielded
# to the block, if one is given. Without a block the lines are still parsed,
# so a malformed line raises whatever parse_alignment raises.
def each
  @io.each_line do |line|
    # start_with? does not depend on String#[] returning a one-character
    # String, so header lines are recognised on every Ruby version.
    next if line.start_with?('@')

    entry = parse_alignment(line.chomp)

    yield entry if block_given?
  end
end
+
private
+ # Method to subparse header lines.
def parse_header(line)
hash = {}
fields = line.split("\t")
@header_hash[:HD] = hash
end
+ # Method to subparse sequence lines.
def parse_sequence(line)
@header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash
hash = {}
end
end
+ # Method to subparse read group lines.
def parse_read_group(line)
@header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash
hash = {}
end
end
+ # Method to subparse program lines.
def parse_program(line)
@header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash
hash = {}
end
end
+ # Method to subparse comment lines.
def parse_comment(line)
@header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array
end
end
- def get_entry
# Method to subparse alignment lines.
# Splits a TAB-separated alignment line into its 11 mandatory fields,
# validates each of them and returns a hash keyed by :QNAME, :FLAG, :RNAME,
# :POS, :MAPQ, :CIGAR, :RNEXT, :PNEXT, :TLEN, :SEQ and :QUAL. The numeric
# fields (FLAG, POS, MAPQ, PNEXT, TLEN) are returned as Integers, the rest
# as Strings. Fields after the 11th are ignored.
# Raises SamError if the line has fewer than 11 fields or a field is invalid.
def parse_alignment(line)
  fields = line.split("\t")

  raise SamError, "Bad number of fields: #{fields.size} < 11" if fields.size < 11

  qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual = fields

  # String#to_i maps non-numeric or empty text to 0 (and "12x" to 12), which
  # would let malformed numeric columns slip through the range checks. Demand
  # a plain decimal integer first, and only then compare against the range.
  int_in = lambda { |text, range| text =~ /\A-?[0-9]+\z/ && range.include?(text.to_i) }

  # \A and \z anchor the whole field, not just one line within it.
  raise SamError, "Bad qname: #{qname}" unless qname =~ /\A[!-?A-~]{1,255}\z/
  raise SamError, "Bad flag: #{flag}"   unless int_in.call(flag, (0..(2**16 - 1)))
  raise SamError, "Bad rname: #{rname}" unless rname =~ /\A(\*|[!-()+-<>-~][!-~]*)\z/
  raise SamError, "Bad pos: #{pos}"     unless int_in.call(pos, (0..(2**29 - 1)))
  raise SamError, "Bad mapq: #{mapq}"   unless int_in.call(mapq, (0..(2**8 - 1)))
  raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /\A(\*|([0-9]+[MIDNSHPX=])+)\z/
  raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /\A(\*|=|[!-()+-<>-~][!-~]*)\z/
  raise SamError, "Bad pnext: #{pnext}" unless int_in.call(pnext, (0..(2**29 - 1)))
  raise SamError, "Bad tlen: #{tlen}"   unless int_in.call(tlen, ((-2**29 + 1)..(2**29 - 1)))
  raise SamError, "Bad seq: #{seq}"     unless seq =~ /\A(\*|[A-Za-z=.]+)\z/
  raise SamError, "Bad qual: #{qual}"   unless qual =~ /\A[!-~]+\z/

  {
    :QNAME => qname,
    :FLAG  => flag.to_i,
    :RNAME => rname,
    :POS   => pos.to_i,
    :MAPQ  => mapq.to_i,
    :CIGAR => cigar,
    :RNEXT => rnext,
    :PNEXT => pnext.to_i,
    :TLEN  => tlen.to_i,
    :SEQ   => seq,
    :QUAL  => qual
  }
end
end
sam = Sam.new(StringIO.new("@CO\tfubar"))
assert_nothing_raised { sam.header }
end
+
# A line holding anywhere from 0 to 10 "*" fields is below the 11 fields
# an alignment line needs, so Sam#each must raise SamError for each of them.
def test_Sam_each_with_bad_field_count_raises
  0.upto(10) do |count|
    line = Array.new(count, "*").join("\t") + $/
    sam  = Sam.new(StringIO.new(line))

    assert_raise(SamError) { sam.each }
  end
end
+
# Iterating over SAM_TEST must not raise.
# NOTE(review): SAM_TEST is defined outside this view; presumably it holds
# well-formed SAM data -- confirm.
def test_Sam_each_with_ok_field_count_dont_raise
  parser = Sam.new(SAM_TEST)

  assert_nothing_raised do
    parser.each
  end
end
+
+# def test_Sam_each_with_bad_qname_raises
+# end
+
+# def test_Sam_each_with_ok_qname_dont_raise
+# end
end