# Class to parse and write SAM files.
class Sam < Filesys
- attr_accessor :io
+ attr_accessor :io, :header
# Method to initialize a Sam object.
+ # The given io is stored in @io, @header starts out as an empty hash and
+ # parse_header is called right away, so the header section is read during
+ # construction. parse_header raises SamError on an unknown header tag,
+ # which therefore surfaces from Sam.new. The parsed result is exposed
+ # through the header accessor.
def initialize(io = nil)
- @io = io
- @header_hash = {}
+ @io = io
+ @header = {}
+
+ parse_header
end
- # Method to parse the header of a SAM file.
+ # Method to iterate over the alignment lines read from @io.
+ # Lines whose first character is '@' are skipped; every other line is
+ # chomped and handed to parse_alignment, and the resulting entry is
+ # yielded when a block is given. Without a block the lines are still
+ # parsed, only the yield is skipped.
+ def each
+ @io.each_line do |line|
+ unless line[0] == '@'
+ entry = parse_alignment(line.chomp)
+
+ yield entry if block_given?
+ end
+ end
+ end
+
+ private
+
+ # Method to parse the header section of a SAM file.
# Each header line should match:
# /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
# Tags containing lowercase letters are reserved for end users.
+ #
+ # Scanning stops at the first line that does not start with '@' and the
+ # stream is rewound, so that a later call to each can still read every
+ # alignment line (each skips the '@' lines on its own). Reading on to
+ # EOF here would leave each with nothing to iterate over.
+ # The rewind needs a seekable stream such as a File or StringIO.
+ #
+ # Fills @header and returns it, or returns nil when no header lines were
+ # found or when no io was given. Raises SamError on an unknown header tag.
- def header
+ def parse_header
+ # Sam.new may be called without an io; there is nothing to parse then.
+ return nil if @io.nil?
+
@io.each_line do |line|
if line =~ /^@([A-Za-z][A-Za-z])/
line.chomp!
tag = $1
case tag
- when 'HD' then parse_header(line)
- when 'SQ' then parse_sequence(line)
- when 'RG' then parse_read_group(line)
- when 'PG' then parse_program(line)
- when 'CO' then parse_comment(line)
+ when 'HD' then subparse_header(line)
+ when 'SQ' then subparse_sequence(line)
+ when 'RG' then subparse_read_group(line)
+ when 'PG' then subparse_program(line)
+ when 'CO' then subparse_comment(line)
else
raise SamError, "Unknown header tag: #{tag}"
end
+ elsif line[0] != '@'
+ # First alignment line: the header section is over. Hand the stream
+ # back from the start so each sees this line and all that follow.
+ @io.rewind
+ break
end
end
- return @header_hash.empty? ? nil : @header_hash
- end
-
- def each
- @io.each_line do |line|
- unless line[0] == '@'
- entry = parse_alignment(line.chomp)
-
- yield entry if block_given?
- end
- end
+ return @header.empty? ? nil : @header
end
- private
-
# Method to subparse header lines.
- def parse_header(line)
+ def subparse_header(line)
hash = {}
fields = line.split("\t")
end
end
- @header_hash[:HD] = hash
+ @header[:HD] = hash
end
# Method to subparse sequence lines.
- def parse_sequence(line)
- @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash
+ def subparse_sequence(line)
+ @header[:SQ] = Hash.new unless @header[:SQ].is_a? Hash
hash = {}
fields = line.split("\t")
end
end
- @header_hash[:SQ][:SN] = Hash.new unless @header_hash[:SQ][:SN].is_a? Hash
+ @header[:SQ][:SN] = Hash.new unless @header[:SQ][:SN].is_a? Hash
- if @header_hash[:SQ][:SN].has_key? seq_name
+ if @header[:SQ][:SN].has_key? seq_name
raise SamError, "Non-unique sequence name: #{seq_name}"
else
- @header_hash[:SQ][:SN][seq_name] = hash
+ @header[:SQ][:SN][seq_name] = hash
end
end
# Method to subparse read group lines.
- def parse_read_group(line)
- @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash
+ def subparse_read_group(line)
+ @header[:RG] = Hash.new unless @header[:RG].is_a? Hash
hash = {}
fields = line.split("\t")
end
end
- @header_hash[:RG][:ID] = Hash.new unless @header_hash[:RG][:ID].is_a? Hash
+ @header[:RG][:ID] = Hash.new unless @header[:RG][:ID].is_a? Hash
- if @header_hash[:RG][:ID].has_key? id
+ if @header[:RG][:ID].has_key? id
raise SamError, "Non-unique read group identifier: #{id}"
else
- @header_hash[:RG][:ID][id] = hash
+ @header[:RG][:ID][id] = hash
end
end
# Method to subparse program lines.
- def parse_program(line)
- @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash
+ def subparse_program(line)
+ @header[:PG] = Hash.new unless @header[:PG].is_a? Hash
hash = {}
fields = line.split("\t")
end
end
- @header_hash[:PG][:ID] = Hash.new unless @header_hash[:PG][:ID].is_a? Hash
+ @header[:PG][:ID] = Hash.new unless @header[:PG][:ID].is_a? Hash
- if @header_hash[:PG][:ID].has_key? id
+ if @header[:PG][:ID].has_key? id
raise SamError, "Non-unique program record identifier: #{id}"
else
- @header_hash[:PG][:ID][id] = hash
+ @header[:PG][:ID][id] = hash
end
end
# Method to subparse comment lines.
- def parse_comment(line)
- @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array
+ def subparse_comment(line)
+ @header[:CO] = Array.new unless @header[:CO].is_a? Array
if line =~ /^@CO\t(.+)/
- @header_hash[:CO] << $1
+ @header[:CO] << $1
else
raise SamError, "Bad comment line: #{line}"
end
raise SamError, "Bad seq: #{seq}" unless seq =~ /^(\*|[A-Za-z=.]+)$/
raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/
+ check_rname(rname)
+
entry = {}
entry[:QNAME] = qname
entry[:FLAG] = flag
entry
end
+
+ # Method to check that rname is declared in the header.
+ # The check is skipped when rname is '*' or when the header holds no
+ # @SQ records (for instance a header made up of @HD or @CO lines only),
+ # since there is then nothing to validate against.
+ # Raises SamError when @SQ records exist and rname is not among them.
+ def check_rname(rname)
+ return if rname == '*'
+
+ # Testing @header.empty? is not enough: a header without @SQ lines has
+ # no :SQ entry, and indexing into nil would raise NoMethodError.
+ sequences = @header[:SQ]
+ return if sequences.nil? || sequences[:SN].nil?
+
+ unless sequences[:SN].has_key? rname.to_sym
+ raise SamError, "rname not found in header hash: #{rname}"
+ end
+ end
end
require 'pp'
require 'stringio'
-SAM_TEST =
+SAM_DATA =
%{@HD\tVN:1.3\tSO:coordinate
@SQ\tSN:ref\tLN:45
@CO\tMyComment
class SamTest < Test::Unit::TestCase
def setup
- @sam = Sam.new(StringIO.new(SAM_TEST))
+ @sam = Sam.new(StringIO.new(SAM_DATA))
end
# def test_Sam_header_without_entry_returns_nil
# end
def test_Sam_header_parse_with_missing_version_number_raises
- sam = Sam.new(StringIO.new("@HD"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@HD")) }
end
def test_Sam_header_parse_with_bad_version_number_raises
- sam = Sam.new(StringIO.new("@HD\tXN:1.3"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@HD\tXN:1.3")) }
end
def test_Sam_header_parse_with_ok_version_number_returns_correctly
end
def test_Sam_header_parse_with_bad_sort_order_raises
- sam = Sam.new(StringIO.new("@HD\tVN:1.3\tSO:fish"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@HD\tVN:1.3\tSO:fish")) }
end
def test_Sam_header_parse_with_ok_sort_order_returns_correctly
end
def test_Sam_header_parse_with_missing_sequence_name_raises
- sam = Sam.new(StringIO.new("@SQ"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@SQ")) }
end
def test_Sam_header_parse_with_bad_sequence_name_raises
- sam = Sam.new(StringIO.new("@SQ\tSN:"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@SQ\tSN:")) }
end
def test_Sam_header_parse_with_ok_sequence_name_returns_correctly
end
def test_Sam_header_parse_with_duplicate_sequence_name_raises
- sam = Sam.new(StringIO.new("@SQ\tSN:ref\n@SQ\tSN:ref"))
- assert_raise(SamError) { sam.header[:SQ][:SN][:ref] }
+ assert_raise(SamError) { Sam.new(StringIO.new("@SQ\tSN:ref\n@SQ\tSN:ref")) }
end
def test_Sam_header_parse_with_missing_sequence_length_raises
- sam = Sam.new(StringIO.new("@SQ\tSN:ref"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@SQ\tSN:ref")) }
end
def test_Sam_header_parse_with_bad_sequence_length_raises
- sam = Sam.new(StringIO.new("@SQ\tSN:scaffold17_1_MH0083\tLN:x"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@SQ\tSN:scaffold17_1_MH0083\tLN:x")) }
end
def test_Sam_header_parse_with_ok_sequence_length_returns_correctly
end
def test_Sam_header_parse_with_full_SQ_dont_raise
- sam = Sam.new("@SQ\tSN:ref\tLN:45\tAS:ident\tM5:87e6b2aedf51b1f9c89becfab9267f41\tSP:E.coli\tUR:http://www.biopieces.org")
- assert_nothing_raised { sam.header }
+ # The header is parsed inside Sam.new, so the construction itself has to
+ # run within the assertion block for a SamError to count as a failure.
+ assert_nothing_raised { Sam.new(StringIO.new("@SQ\tSN:ref\tLN:45\tAS:ident\tM5:87e6b2aedf51b1f9c89becfab9267f41\tSP:E.coli\tUR:http://www.biopieces.org")) }
end
def test_Sam_header_parse_with_bad_read_group_identifier_raises
- sam = Sam.new(StringIO.new("@RG\tID:"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@RG\tID:")) }
end
def test_Sam_header_parse_with_missing_read_group_identifier_raises
- sam = Sam.new(StringIO.new("@RG"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@RG")) }
end
def test_Sam_header_parse_with_duplicate_read_group_identifier_raises
- sam = Sam.new(StringIO.new("@RG\tID:123\n@RG\tID:123"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@RG\tID:123\n@RG\tID:123")) }
end
def test_Sam_header_parse_with_ok_read_group_identifier_dont_raise
end
def test_Sam_header_parse_with_bad_flow_order_raises
- sam = Sam.new(StringIO.new("@RG\tID:123\tFO:3"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@RG\tID:123\tFO:3")) }
end
def test_Sam_header_parse_with_ok_flow_order_dont_raise
end
def test_Sam_header_parse_with_bad_platform_raises
- sam = Sam.new(StringIO.new("@RG\tID:123\tPL:maersk"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@RG\tID:123\tPL:maersk")) }
end
def test_Sam_header_parse_with_ok_platform_dont_raise
end
def test_Sam_header_parse_with_bad_program_identifier_raises
- sam = Sam.new(StringIO.new("@PG\tID:"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@PG\tID:")) }
end
def test_Sam_header_parse_with_missing_program_identifier_raises
- sam = Sam.new(StringIO.new("@PG"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@PG")) }
end
def test_Sam_header_parse_with_duplicate_program_identifier_raises
- sam = Sam.new(StringIO.new("@PG\tID:123\n@PG\tID:123"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@PG\tID:123\n@PG\tID:123")) }
end
def test_Sam_header_parse_with_bad_comment_raises
- sam = Sam.new(StringIO.new("@CO\t"))
- assert_raise(SamError) { sam.header }
+ assert_raise(SamError) { Sam.new(StringIO.new("@CO\t")) }
end
def test_Sam_header_parse_with_ok_comment_dont_raise
end
def test_Sam_each_with_ok_field_count_dont_raise
- sam = Sam.new(SAM_TEST)
+ sam = Sam.new(StringIO.new(SAM_DATA))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_qname_raises
- sam = Sam.new(" \t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new(" \t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_qname_dont_raise
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised(SamError) { sam.each }
end
def test_Sam_each_with_bad_flag_raises
- sam = Sam.new("*\t-1\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t-1\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
- sam = Sam.new("*\t65536\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t65536\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_flag_dont_raise
- sam = Sam.new("*\t0\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t0\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t65535\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t65535\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_rname_raises
- sam = Sam.new("*\t*\t \t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t \t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_rname_dont_raise
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_pos_raises
- sam = Sam.new("*\t*\t*\t-1\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t-1\t*\t*\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
- sam = Sam.new("*\t*\t*\t536870912\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t536870912\t*\t*\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_pos_dont_raise
- sam = Sam.new("*\t*\t*\t0\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t0\t*\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t*\t*\t536870911\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t536870911\t*\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_mapq_raises
- sam = Sam.new("*\t*\t*\t*\t-1\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t-1\t*\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
- sam = Sam.new("*\t*\t*\t*\t256\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t256\t*\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_mapq_dont_raise
- sam = Sam.new("*\t*\t*\t*\t0\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t0\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t*\t*\t*\t255\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t255\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_cigar_raises
- sam = Sam.new("*\t*\t*\t*\t*\t24\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t24\t*\t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_cigar_dont_raise
- sam = Sam.new("*\t*\t*\t*\t*\t24M2I3D\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t24M2I3D\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_rnext_raises
- sam = Sam.new("*\t*\t*\t*\t*\t*\t \t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t \t*\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_rnext_dont_raise
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t*\t*\t*\t*\t*\t=\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t=\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t*\t*\t*\t*\t*\t!\t*\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t!\t*\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_pnext_raises
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t-1\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t-1\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t536870912\t*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t536870912\t*\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_pnext_dont_raise
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t0\t*\t*\t*\n")
+ # Lower bound of PNEXT (eighth column).
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t0\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t*\t*\t*\t*\t*\t536870911\t*\t*\t*\t*\n")
+ # Upper bound of PNEXT. The value belongs in the eighth column; one
+ # column to the left it would be read as RNEXT and PNEXT would go untested.
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t536870911\t*\t*\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_tlen_raises
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t-536870912\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t-536870912\t*\t*\n"))
assert_raise(SamError) { sam.each }
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t536870912\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t536870912\t*\t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_tlen_dont_raise
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t-536870911\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t-536870911\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t536870911\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t536870911\t*\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_seq_raises
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\t \t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\t \t*\n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_seq_dont_raise
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\t*\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\t*\t*\n"))
assert_nothing_raised { sam.each }
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\tATCGatcg=.\t*\n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\tATCGatcg=.\t*\n"))
assert_nothing_raised { sam.each }
end
def test_Sam_each_with_bad_qual_raises
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\t*\t \n")
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\t*\t \n"))
assert_raise(SamError) { sam.each }
end
def test_Sam_each_with_ok_qual_dont_raise
- sam = Sam.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\t*\t@\n")
- assert_nothing_raised(SamError) { sam.each }
+ sam = Sam.new(StringIO.new("*\t*\t*\t*\t*\t*\t*\t*\t\*\t*\t@\n"))
+ assert_nothing_raised { sam.each }
+ end
+
+ def test_Sam_each_with_rname_missing_from_header_raises
+ sam = Sam.new(StringIO.new("@SQ\tSN:ref\tLN:45\n*\t*\tMIS\t*\t*\t*\t*\t*\t\*\t*\t*\n"))
+ assert_raise(SamError) { sam.each }
+ end
+
+ def test_Sam_each_with_rname_present_in_header_dont_raise
+ # RNAME matches the sequence name declared by the @SQ line.
+ sam = Sam.new(StringIO.new("@SQ\tSN:ref\tLN:45\n*\t*\tref\t*\t*\t*\t*\t*\t\*\t*\t*\n"))
+ assert_nothing_raised { sam.each }
+
+ # RNAME '*' is exempt from the header lookup.
+ sam = Sam.new(StringIO.new("@SQ\tSN:ref\tLN:45\n*\t*\t*\t*\t*\t*\t*\t*\t\*\t*\t*\n"))
+ assert_nothing_raised { sam.each }
end
end