From: martinahansen Date: Mon, 22 Aug 2011 20:12:22 +0000 (+0000) Subject: worked on sam.rb X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=94a4251e75ccf8a5c697654d58922c47bde10520;p=biopieces.git worked on sam.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1489 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_ruby/lib/maasha/sam.rb b/code_ruby/lib/maasha/sam.rb index b45f039..4a6670d 100644 --- a/code_ruby/lib/maasha/sam.rb +++ b/code_ruby/lib/maasha/sam.rb @@ -32,21 +32,25 @@ class SamError < StandardError; end REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/) REGEX_COMMENT = Regexp.new(/^@CO\t.*/) +# Class to parse and write SAM files. class Sam < Filesys attr_accessor :io + # Method to initialize a Sam object. def initialize(io = nil) @io = io @header_hash = {} end - # Each header line should match: /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/. + # Method to parse the header of a SAM file. + # Each header line should match: + # /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/. # Tags containing lowercase letters are reserved for end users. def header @io.each_line do |line| - line.chomp! - if line =~ /^@([A-Za-z][A-Za-z])/ + line.chomp! + tag = $1 case tag @@ -67,8 +71,19 @@ class Sam < Filesys return @header_hash.empty? ? nil : @header_hash end + def each + @io.each_line do |line| + unless line[0] == '@' + entry = parse_alignment(line.chomp) + + yield entry if block_given? + end + end + end + private + # Method to subparse header lines. def parse_header(line) hash = {} fields = line.split("\t") @@ -90,6 +105,7 @@ class Sam < Filesys @header_hash[:HD] = hash end + # Method to subparse sequence lines. def parse_sequence(line) @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash hash = {} @@ -125,6 +141,7 @@ class Sam < Filesys end end + # Method to subparse read group lines. def parse_read_group(line) @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash hash = {} @@ -166,6 +183,7 @@ class Sam < Filesys end end + # Method to subparse program lines. def parse_program(line) @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash hash = {} @@ -195,6 +213,7 @@ class Sam < Filesys end end + # Method to subparse comment lines. def parse_comment(line) @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array @@ -205,7 +224,50 @@ class Sam < Filesys end end - def get_entry + # Method to subparse alignment lines. + def parse_alignment(line) + fields = line.split("\t") + + raise SamError, "Bad number of fields: #{fields.size} < 11" if fields.size < 11 + + qname = fields[0] + flag = fields[1].to_i + rname = fields[2] + pos = fields[3].to_i + mapq = fields[4].to_i + cigar = fields[5] + rnext = fields[6] + pnext = fields[7].to_i + tlen = fields[8].to_i + seq = fields[9] + qual = fields[10] + + raise SamError, "Bad qname: #{qname}" unless qname =~ /^[!-?A-~]{1,255}$/ + raise SamError, "Bad flag: #{flag}" unless (0 .. 2**16 - 1).include? flag + raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/ + raise SamError, "Bad pos: #{pos}" unless (0 .. 2**29 - 1).include? pos + raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq + raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/ + raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/ + raise SamError, "Bad pnext: #{pnext}" unless (0 .. 2**29 - 1).include? pnext + raise SamError, "Bad tlen: #{tlen}" unless (-2**29 + 1 .. 2**29 - 1).include? tlen + raise SamError, "Bad seq: #{seq}" unless seq =~ /^(\*|[A-Za-z=.]+)$/ + raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/ + + entry = {} + entry[:QNAME] = qname + entry[:FLAG] = flag + entry[:RNAME] = rname + entry[:POS] = pos + entry[:MAPQ] = mapq + entry[:CIGAR] = cigar + entry[:RNEXT] = rnext + entry[:PNEXT] = pnext + entry[:TLEN] = tlen + entry[:SEQ] = seq + entry[:QUAL] = qual + + entry end end diff --git a/code_ruby/test/maasha/test_sam.rb b/code_ruby/test/maasha/test_sam.rb index dc384cb..ee3c346 100755 --- a/code_ruby/test/maasha/test_sam.rb +++ b/code_ruby/test/maasha/test_sam.rb @@ -184,5 +184,26 @@ class SamTest < Test::Unit::TestCase sam = Sam.new(StringIO.new("@CO\tfubar")) assert_nothing_raised { sam.header } end + + def test_Sam_each_with_bad_field_count_raises + fields = [] + + (0 ... 11).each do |i| + sam = Sam.new(StringIO.new(fields.join("\t") + $/)) + assert_raise(SamError) { sam.each } + fields << "*" + end + end + + def test_Sam_each_with_ok_field_count_dont_raise + sam = Sam.new(SAM_TEST) + assert_nothing_raised { sam.each } + end + +# def test_Sam_each_with_bad_qname_raises +# end + +# def test_Sam_each_with_ok_qname_dont_raise +# end end