X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fsam.rb;h=8772e6382f20913d2fca3c9076b90475d2824564;hb=a80169a9121e8537f169cd85010d2ceae3a8d4fd;hp=67e950c082347425865b3745dc5cb8f2a4a5d95d;hpb=b5ff429fe5afe30aae090e313b015c29039027c1;p=biopieces.git diff --git a/code_ruby/lib/maasha/sam.rb b/code_ruby/lib/maasha/sam.rb index 67e950c..8772e63 100644 --- a/code_ruby/lib/maasha/sam.rb +++ b/code_ruby/lib/maasha/sam.rb @@ -1,4 +1,4 @@ -# Copyright (C) 2007-2011 Martin A. Hansen. +# Copyright (C) 2007-2013 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -29,28 +29,166 @@ require 'pp' require 'maasha/filesys' require 'maasha/seq' - -# Error class for all exceptions to do with Genbank. -class SamError < StandardError; end +require 'maasha/cigar' REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/) REGEX_COMMENT = Regexp.new(/^@CO\t.*/) +FLAG_MULTI = 0x1 # Template having multiple fragments in sequencing +FLAG_ALIGNED = 0x2 # Each fragment properly aligned according to the aligner +FLAG_UNMAPPED = 0x4 # Fragment unmapped +FLAG_NEXT_UNMAPPED = 0x8 # Next fragment in the template unmapped +FLAG_REVCOMP = 0x10 # SEQ being reverse complemented +FLAG_NEXT_REVCOMP = 0x20 # SEQ of the next fragment in the template being reversed +FLAG_FIRST = 0x40 # The first fragment in the template +FLAG_LAST = 0x80 # The last fragment in the template +FLAG_SECONDARY_ALIGNMENT = 0x100 # Secondary alignment +FLAG_QUALITY_FAIL = 0x200 # Not passing quality controls +FLAG_DUPLICATES = 0x400 # PCR or optical duplicate + +# Error class for all exceptions to do with SAM. +class SamError < StandardError; end + # Class to parse and write SAM files. class Sam < Filesys - attr_accessor :io + attr_accessor :io, :header + + # Class method to convert a SAM entry + # to a Biopiece record. 
+ def self.to_bp(sam) + bp = {} + + bp[:REC_TYPE] = 'SAM' + bp[:Q_ID] = sam[:QNAME] + bp[:STRAND] = sam[:FLAG].revcomp? ? '-' : '+' + bp[:S_ID] = sam[:RNAME] + bp[:S_BEG] = sam[:POS] + bp[:S_END] = sam[:POS] + sam[:SEQ].length - 1 + bp[:MAPQ] = sam[:MAPQ] + bp[:CIGAR] = sam[:CIGAR].to_s + + unless sam[:RNEXT] == '*' + bp[:Q_ID2] = sam[:RNEXT] + bp[:S_BEG2] = sam[:PNEXT] + bp[:TLEN] = sam[:TLEN] + end + + bp[:SEQ] = sam[:SEQ].seq + + unless sam[:SEQ].qual.nil? + bp[:SCORES] = sam[:SEQ].qual_convert!(:base_33, :base_64).qual + end + + if sam[:NM] and sam[:NM].to_i > 0 + bp[:NM] = sam[:NM] + bp[:MD] = sam[:MD] + bp[:ALIGN] = self.align_descriptors(sam) + end + + bp + end + + # Class method to create a new SAM entry from a Biopiece record. + # FIXME + def self.new_bp(bp) + qname = bp[:Q_ID] + flag = 0 + rname = bp[:S_ID] + pos = bp[:S_BEG] + mapq = bp[:MAPQ] + cigar = bp[:CIGAR] + rnext = bp[:Q_ID2] || '*' + pnext = bp[:S_BEG2] || 0 + tlen = bp[:TLEN] || 0 + seq = bp[:SEQ] + qual = bp[:SCORES] || '*' + nm = "NM:i:#{bp[:NM]}" if bp[:NM] + md = "MD:Z:#{bp[:MD]}" if bp[:MD] + + flag |= FLAG_REVCOMP if bp[:STRAND] == '-' + + if qname && flag && rname && pos && mapq && cigar && rnext && pnext && tlen && seq && qual + ary = [qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual] + ary << nm if nm + ary << md if md + + ary.join("\t") + end + end + + # Create alignment descriptors according to the KISS + # format description: + # http://code.google.com/p/biopieces/wiki/KissFormat + def self.align_descriptors(sam) + offset = 0 + align = [] + + # Insertions + sam[:CIGAR].each do |len, op| + if op == 'I' + (0 ... 
len).each do |i| + nt = sam[:SEQ].seq[offset + i] + + align << [offset + i, "->#{nt}"] + end + end + + offset += len + end + + offset = 0 + deletions = 0 + + sam[:MD].scan(/\d+|\^[A-Z]+|[A-Z]+/).each do |m| + if m =~ /\d+/ # Matches + offset += m.to_i + elsif m[0] == '^' # Deletions + m.each_char do |nt| + unless nt == '^' + align << [offset, "#{nt}>-"] + deletions += 1 + offset += 1 + end + end + else # Mismatches + m.each_char do |nt| + nt2 = sam[:SEQ].seq[offset - deletions] + + align << [offset, "#{nt}>#{nt2}"] + + offset += 1 + end + end + end + + align.sort_by { |a| a.first }.map { |k, v| "#{k}:#{v}" }.join(",") + end # Method to initialize a Sam object. def initialize(io = nil) - @io = io - @header_hash = {} + @io = io + @header = {} + + parse_header if @io + end + + def each + @io.each_line do |line| + unless line[0] == '@' + entry = parse_alignment(line.chomp) + + yield entry if block_given? + end + end end - # Method to parse the header of a SAM file. + private + + # Method to parse the header section of a SAM file. # Each header line should match: # /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/. # Tags containing lowercase letters are reserved for end users. - def header + def parse_header @io.each_line do |line| if line =~ /^@([A-Za-z][A-Za-z])/ line.chomp! @@ -58,11 +196,11 @@ class Sam < Filesys tag = $1 case tag - when 'HD' then parse_header(line) - when 'SQ' then parse_sequence(line) - when 'RG' then parse_read_group(line) - when 'PG' then parse_program(line) - when 'CO' then parse_comment(line) + when 'HD' then subparse_header(line) + when 'SQ' then subparse_sequence(line) + when 'RG' then subparse_read_group(line) + when 'PG' then subparse_program(line) + when 'CO' then subparse_comment(line) else raise SamError, "Unknown header tag: #{tag}" end @@ -72,23 +210,11 @@ class Sam < Filesys end end - return @header_hash.empty? ? nil : @header_hash + return @header.empty? ? 
nil : @header end - def each - @io.each_line do |line| - unless line[0] == '@' - entry = parse_alignment(line.chomp) - - yield entry if block_given? - end - end - end - - private - # Method to subparse header lines. - def parse_header(line) + def subparse_header(line) hash = {} fields = line.split("\t") @@ -106,12 +232,12 @@ class Sam < Filesys end end - @header_hash[:HD] = hash + @header[:HD] = hash end # Method to subparse sequence lines. - def parse_sequence(line) - @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash + def subparse_sequence(line) + @header[:SQ] = Hash.new unless @header[:SQ].is_a? Hash hash = {} fields = line.split("\t") @@ -136,18 +262,18 @@ class Sam < Filesys end end - @header_hash[:SQ][:SN] = Hash.new unless @header_hash[:SQ][:SN].is_a? Hash + @header[:SQ][:SN] = Hash.new unless @header[:SQ][:SN].is_a? Hash - if @header_hash[:SQ][:SN].has_key? seq_name + if @header[:SQ][:SN][seq_name] raise SamError, "Non-unique sequence name: #{seq_name}" else - @header_hash[:SQ][:SN][seq_name] = hash + @header[:SQ][:SN][seq_name] = hash end end # Method to subparse read group lines. - def parse_read_group(line) - @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash + def subparse_read_group(line) + @header[:RG] = Hash.new unless @header[:RG].is_a? Hash hash = {} fields = line.split("\t") @@ -166,30 +292,30 @@ class Sam < Filesys end end - if hash.has_key? :FO + if hash[:FO] unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/ raise SamError, "Bad flow order: #{hash[:FO]}" end end - if hash.has_key? :PL + if hash[:PL] unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/ raise SamError, "Bad platform: #{hash[:PL]}" end end - @header_hash[:RG][:ID] = Hash.new unless @header_hash[:RG][:ID].is_a? Hash + @header[:RG][:ID] = Hash.new unless @header[:RG][:ID].is_a? Hash - if @header_hash[:RG][:ID].has_key? 
id + if @header[:RG][:ID][id] raise SamError, "Non-unique read group identifier: #{id}" else - @header_hash[:RG][:ID][id] = hash + @header[:RG][:ID][id] = hash end end # Method to subparse program lines. - def parse_program(line) - @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash + def subparse_program(line) + @header[:PG] = Hash.new unless @header[:PG].is_a? Hash hash = {} fields = line.split("\t") @@ -208,21 +334,21 @@ class Sam < Filesys end end - @header_hash[:PG][:ID] = Hash.new unless @header_hash[:PG][:ID].is_a? Hash + @header[:PG][:ID] = Hash.new unless @header[:PG][:ID].is_a? Hash - if @header_hash[:PG][:ID].has_key? id + if @header[:PG][:ID][id] raise SamError, "Non-unique program record identifier: #{id}" else - @header_hash[:PG][:ID][id] = hash + @header[:PG][:ID][id] = hash end end # Method to subparse comment lines. - def parse_comment(line) - @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array + def subparse_comment(line) + @header[:CO] = Array.new unless @header[:CO].is_a? Array if line =~ /^@CO\t(.+)/ - @header_hash[:CO] << $1 + @header[:CO] << $1 else raise SamError, "Bad comment line: #{line}" end @@ -246,33 +372,188 @@ class Sam < Filesys seq = fields[9] qual = fields[10] - raise SamError, "Bad qname: #{qname}" unless qname =~ /^[!-?A-~]{1,255}$/ - raise SamError, "Bad flag: #{flag}" unless (0 .. 2**16 - 1).include? flag - raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/ - raise SamError, "Bad pos: #{pos}" unless (0 .. 2**29 - 1).include? pos - raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq - raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/ - raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/ - raise SamError, "Bad pnext: #{pnext}" unless (0 .. 2**29 - 1).include? pnext - raise SamError, "Bad tlen: #{tlen}" unless (-2**29 + 1 .. 2**29 - 1).include? 
tlen - raise SamError, "Bad seq: #{seq}" unless seq =~ /^(\*|[A-Za-z=.]+)$/ - raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/ + check_qname(qname) + check_flag(flag) + check_rname(rname) + check_pos(pos) + check_mapq(mapq) + check_rnext(rnext) + check_pnext(pnext) + check_tlen(tlen) + check_seq(seq) + check_qual(qual) entry = {} entry[:QNAME] = qname - entry[:FLAG] = flag + entry[:FLAG] = Flag.new(flag) entry[:RNAME] = rname entry[:POS] = pos entry[:MAPQ] = mapq - entry[:CIGAR] = cigar + entry[:CIGAR] = Cigar.new(cigar) entry[:RNEXT] = rnext entry[:PNEXT] = pnext entry[:TLEN] = tlen - entry[:SEQ] = seq + entry[:SEQ] = (qual == '*') ? Seq.new(seq_name: qname, seq: seq) : Seq.new(seq_name: qname, seq: seq, qual: qual) entry[:QUAL] = qual + # Optional fields - where some are really important! HATE HATE HATE SAM!!! + + fields[11 .. -1].each do |field| + tag, type, val = field.split(':', 3) + + raise SamError, "Non-unique optional tag: #{tag}" if entry[tag.to_sym] + + # A [!-~] Printable character + + # i [-+]?[0-9]+ Signed 32-bit integer + if type == 'i' + raise SamError, "Bad tag in optional field: #{field}" unless val =~ /^[-+]?[0-9]+$/ + val = val.to_i + end + + # f [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? Single-precision floating number + # Z [ !-~]+ Printable string, including space + # H [0-9A-F]+ Byte array in the Hex format + # B [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+ Integer or numeric array + + entry[tag.to_sym] = val + end + entry end + + # Method to check qname. + def check_qname(qname) + raise SamError, "Bad qname: #{qname}" unless qname =~ /^[!-?A-~]{1,255}$/ + end + + # Method to check flag. + def check_flag(flag) + raise SamError, "Bad flag: #{flag}" unless (0 .. 2**16 - 1).include? flag + end + + # Method to check if rname, when not '*' and + # @SQ header lines are present, is located in + # the header hash. 
+ def check_rname(rname) + raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/ + + unless @header[:SQ].nil? or rname == '*' + unless @header[:SQ][:SN][rname.to_sym] + raise SamError, "rname not found in header hash: #{rname}" + end + end + end + + # Method to check pos. + def check_pos(pos) + raise SamError, "Bad pos: #{pos}" unless (0 .. 2**29 - 1).include? pos + end + + # Method to check mapq. + def check_mapq(mapq) + raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq + end + + # Method to check if rnext, when not '*' or '=' + # and @SQ header lines are present, is located + # in the header hash. + def check_rnext(rnext) + raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/ + + unless @header[:SQ].nil? or rnext == '*' or rnext == '=' + unless @header[:SQ][:SN][rnext.to_sym] + raise SamError, "rnext not found in header hash: #{rnext}" + end + end + end + + # Method to check pnext. + def check_pnext(pnext) + raise SamError, "Bad pnext: #{pnext}" unless (0 .. 2**29 - 1).include? pnext + end + + # Method to check tlen. + def check_tlen(tlen) + raise SamError, "Bad tlen: #{tlen}" unless (-2**29 + 1 .. 2**29 - 1).include? tlen + end + + # Method to check seq. + def check_seq(seq) + raise SamError, "Bad seq: #{seq}" unless seq =~ /^(\*|[A-Za-z=.]+)$/ + end + + # Method to check qual. + def check_qual(qual) + raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/ + end + + # Class to deconvolute the SAM flag field. + class Flag + attr_reader :flag + + # Method to initialize a Flag object. + def initialize(flag) + @flag = flag + end + + # Method to test if template has + # multiple fragments in sequencing. + def multi? + (flag & FLAG_MULTI) != 0 + end + + # Method to test if each fragment + # properly aligned according to the aligner. + def aligned? + (flag & FLAG_ALIGNED) != 0 + end + + # Method to test if the fragment was unmapped. + def unmapped? 
+ (flag & FLAG_UNMAPPED) != 0 + end + + # Method to test if the next fragment was unmapped. + def next_unmapped? + (flag & FLAG_NEXT_UNMAPPED) != 0 + end + + # Method to test if the fragment was reverse complemented. + def revcomp? + (flag & FLAG_REVCOMP) != 0 + end + + # Method to test if the next fragment was reverse complemented. + def next_revcomp? + (flag & FLAG_NEXT_REVCOMP) != 0 + end + + # Method to test if the fragment was first in the template. + def first? + (flag & FLAG_FIRST) != 0 + end + + # Method to test if the fragment was last in the template. + def last? + (flag & FLAG_LAST) != 0 + end + + # Method to test for secondary alignment. + def secondary_alignment? + (flag & FLAG_SECONDARY_ALIGNMENT) != 0 + end + + # Method to test for quality fail. + def quality_fail? + (flag & FLAG_QUALITY_FAIL) != 0 + end + + # Method to test for PCR or optical duplicates. + def duplicates? + (flag & FLAG_DUPLICATES) != 0 + end + end end