+# Copyright (C) 2007-2013 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
require 'pp'
require 'maasha/filesys'
require 'maasha/seq'
-
-# Error class for all exceptions to do with Genbank.
-class SamError < StandardError; end
+require 'maasha/cigar'
REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
REGEX_COMMENT = Regexp.new(/^@CO\t.*/)
+FLAG_MULTI = 0x1 # Template having multiple fragments in sequencing
+FLAG_ALIGNED = 0x2 # Each fragment properly aligned according to the aligner
+FLAG_UNMAPPED = 0x4 # Fragment unmapped
+FLAG_NEXT_UNMAPPED = 0x8 # Next fragment in the template unmapped
+FLAG_REVCOMP = 0x10 # SEQ being reverse complemented
+FLAG_NEXT_REVCOMP = 0x20 # SEQ of the next fragment in the template being reversed
+FLAG_FIRST = 0x40 # The first fragment in the template
+FLAG_LAST = 0x80 # The last fragment in the template
+FLAG_SECONDARY_ALIGNMENT = 0x100 # Secondary alignment
+FLAG_QUALITY_FAIL = 0x200 # Not passing quality controls
+FLAG_DUPLICATES = 0x400 # PCR or optical duplicate
+
+# Error class for all exceptions to do with SAM.
+class SamError < StandardError; end
+
# Class to parse and write SAM files.
class Sam < Filesys
- attr_accessor :io
+ attr_accessor :io, :header
+
+ # Class method that translates a parsed SAM entry (a Hash keyed by
+ # the SAM field names) into a Biopiece record Hash.
+ #
+ # The mate keys (Q_ID2, S_BEG2, TLEN) are emitted only when RNEXT is
+ # not '*', SCORES only when the sequence carries quality data, and
+ # NM/MD/ALIGN only when the NM tag is present and greater than zero.
+ def self.to_bp(sam)
+   seq = sam[:SEQ]
+
+   bp = {
+     :REC_TYPE => 'SAM',
+     :Q_ID     => sam[:QNAME],
+     :STRAND   => (sam[:FLAG].revcomp? ? '-' : '+'),
+     :S_ID     => sam[:RNAME],
+     :S_BEG    => sam[:POS],
+     :S_END    => sam[:POS] + seq.length - 1,
+     :MAPQ     => sam[:MAPQ],
+     :CIGAR    => sam[:CIGAR].to_s
+   }
+
+   if sam[:RNEXT] != '*'
+     bp[:Q_ID2]  = sam[:RNEXT]
+     bp[:S_BEG2] = sam[:PNEXT]
+     bp[:TLEN]   = sam[:TLEN]
+   end
+
+   bp[:SEQ] = seq.seq
+
+   # convert_phred2illumina! is a bang method on the Seq object and
+   # presumably converts in place - TODO confirm against maasha/seq.
+   bp[:SCORES] = seq.convert_phred2illumina!.qual unless seq.qual.nil?
+
+   edits = sam[:NM]
+
+   if edits && edits.to_i > 0
+     bp[:NM]    = edits
+     bp[:MD]    = sam[:MD]
+     bp[:ALIGN] = align_descriptors(sam)
+   end
+
+   bp
+ end
+
+ # Class method to create a new SAM entry from a Biopiece record.
+ #
+ # Returns the tab-joined SAM line, or nil when any of the mandatory
+ # record keys (Q_ID, S_ID, S_BEG, MAPQ, CIGAR, SEQ) is missing.
+ # Optional keys fall back to '*' for RNEXT and QUAL and to 0 for
+ # PNEXT and TLEN. NM and MD are appended as optional fields when
+ # present in the record.
+ #
+ # FIXME: only the reverse complement bit of FLAG is ever derived.
+ # NOTE(review): SCORES is written verbatim as QUAL - confirm whether
+ # it must be converted back to Phred encoding first.
+ def self.new_bp(bp)
+   qname = bp[:Q_ID]
+   rname = bp[:S_ID]
+   pos   = bp[:S_BEG]
+   mapq  = bp[:MAPQ]
+   cigar = bp[:CIGAR]
+   seq   = bp[:SEQ]
+
+   # The flag and the defaulted fields below can never be nil, so only
+   # these six values decide whether a line can be built.
+   return nil unless qname && rname && pos && mapq && cigar && seq
+
+   # SAM sets bit 0x10 when SEQ is reverse complemented, which is the
+   # '-' strand of a Biopiece record.
+   flag  = 0
+   flag |= FLAG_REVCOMP if bp[:STRAND] == '-'
+
+   fields = [qname, flag, rname, pos, mapq, cigar,
+             bp[:Q_ID2] || '*', bp[:S_BEG2] || 0, bp[:TLEN] || 0,
+             seq, bp[:SCORES] || '*']
+
+   fields << "NM:i:#{bp[:NM]}" if bp[:NM]
+   fields << "MD:Z:#{bp[:MD]}" if bp[:MD]
+
+   fields.join("\t")
+ end
+
+ # Build the ALIGN descriptor string for a SAM entry according to
+ # the KISS format description:
+ # http://code.google.com/p/biopieces/wiki/KissFormat
+ #
+ # Insertions ("->N") are collected from the CIGAR operations,
+ # deletions ("N>-") and mismatches ("N>M") from the MD tag. The
+ # descriptors are sorted by offset and returned as a comma separated
+ # list of "offset:descriptor" items.
+ def self.align_descriptors(sam)
+   descriptors = []
+
+   # Pass 1: insertions. The cursor advances by the length of every
+   # CIGAR operation, whatever its type.
+   cursor = 0
+
+   sam[:CIGAR].each do |len, op|
+     if op == 'I'
+       (0 ... len).each do |i|
+         descriptors << [cursor + i, "->#{sam[:SEQ].seq[cursor + i]}"]
+       end
+     end
+
+     cursor += len
+   end
+
+   # Pass 2: deletions and mismatches taken from the MD tag.
+   cursor  = 0
+   deleted = 0
+
+   sam[:MD].scan(/\d+|\^[A-Z]+|[A-Z]+/).each do |token|
+     case token
+     when /\d+/   # Matches
+       cursor += token.to_i
+     when /\A\^/  # Deletions
+       token.delete('^').each_char do |nt|
+         descriptors << [cursor, "#{nt}>-"]
+         deleted += 1
+         cursor  += 1
+       end
+     else         # Mismatches
+       token.each_char do |nt|
+         # The substituted base is read from SEQ at the cursor less
+         # the number of deleted bases counted so far.
+         descriptors << [cursor, "#{nt}>#{sam[:SEQ].seq[cursor - deleted]}"]
+         cursor += 1
+       end
+     end
+   end
+
+   descriptors.sort_by(&:first).map { |pos, desc| "#{pos}:#{desc}" }.join(',')
+ end
# Method to initialize a Sam object.
+ # Stores the given io, starts with an empty header hash and then
+ # calls parse_header, which reads from @io.
+ # NOTE(review): with the default io = nil, parse_header would call
+ # each_line on nil - confirm that callers always pass an io.
def initialize(io = nil)
- @io = io
- @header_hash = {}
+ @io = io
+ @header = {}
+
+ parse_header
+ end
+
+ # Iterate over the lines of @io and yield every alignment entry.
+ # Lines starting with '@' are skipped; every other line is chomped
+ # and handed to parse_alignment. The parsed entry is yielded only
+ # when a block is given.
+ def each
+   @io.each_line do |line|
+     next if line[0] == '@'
+
+     entry = parse_alignment(line.chomp)
+
+     yield entry if block_given?
+   end
end
- # Method to parse the header of a SAM file.
+ private
+
+ # Method to parse the header section of a SAM file.
# Each header line should match:
# /^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ or /^@CO\t.*/.
# Tags containing lowercase letters are reserved for end users.
+ # Each matching line is dispatched on its two-letter tag to the
+ # corresponding subparse method; any other tag raises SamError.
+ # Returns the header hash, or nil when it is empty.
+ # NOTE(review): lines that do not match the header pattern are
+ # ignored and the loop has no break, so each_line runs to the end of
+ # @io - confirm that @io is rewound or reopened before #each reads
+ # the alignment lines.
- def header
+ def parse_header
@io.each_line do |line|
if line =~ /^@([A-Za-z][A-Za-z])/
line.chomp!
tag = $1
case tag
- when 'HD' then parse_header(line)
- when 'SQ' then parse_sequence(line)
- when 'RG' then parse_read_group(line)
- when 'PG' then parse_program(line)
- when 'CO' then parse_comment(line)
+ when 'HD' then subparse_header(line)
+ when 'SQ' then subparse_sequence(line)
+ when 'RG' then subparse_read_group(line)
+ when 'PG' then subparse_program(line)
+ when 'CO' then subparse_comment(line)
else
raise SamError, "Unknown header tag: #{tag}"
end
end
end
- return @header_hash.empty? ? nil : @header_hash
+ return @header.empty? ? nil : @header
end
- def each
- @io.each_line do |line|
- unless line[0] == '@'
- entry = parse_alignment(line.chomp)
-
- yield entry if block_given?
- end
- end
- end
-
- private
-
# Method to subparse header lines.
- def parse_header(line)
+ def subparse_header(line)
hash = {}
fields = line.split("\t")
end
end
- @header_hash[:HD] = hash
+ @header[:HD] = hash
end
# Method to subparse sequence lines.
- def parse_sequence(line)
- @header_hash[:SQ] = Hash.new unless @header_hash[:SQ].is_a? Hash
+ def subparse_sequence(line)
+ @header[:SQ] = Hash.new unless @header[:SQ].is_a? Hash
hash = {}
fields = line.split("\t")
end
end
- @header_hash[:SQ][:SN] = Hash.new unless @header_hash[:SQ][:SN].is_a? Hash
+ @header[:SQ][:SN] = Hash.new unless @header[:SQ][:SN].is_a? Hash
- if @header_hash[:SQ][:SN].has_key? seq_name
+ if @header[:SQ][:SN][seq_name]
raise SamError, "Non-unique sequence name: #{seq_name}"
else
- @header_hash[:SQ][:SN][seq_name] = hash
+ @header[:SQ][:SN][seq_name] = hash
end
end
# Method to subparse read group lines.
- def parse_read_group(line)
- @header_hash[:RG] = Hash.new unless @header_hash[:RG].is_a? Hash
+ def subparse_read_group(line)
+ @header[:RG] = Hash.new unless @header[:RG].is_a? Hash
hash = {}
fields = line.split("\t")
end
end
- if hash.has_key? :FO
+ if hash[:FO]
unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/
raise SamError, "Bad flow order: #{hash[:FO]}"
end
end
- if hash.has_key? :PL
+ if hash[:PL]
unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/
raise SamError, "Bad platform: #{hash[:PL]}"
end
end
- @header_hash[:RG][:ID] = Hash.new unless @header_hash[:RG][:ID].is_a? Hash
+ @header[:RG][:ID] = Hash.new unless @header[:RG][:ID].is_a? Hash
- if @header_hash[:RG][:ID].has_key? id
+ if @header[:RG][:ID][id]
raise SamError, "Non-unique read group identifier: #{id}"
else
- @header_hash[:RG][:ID][id] = hash
+ @header[:RG][:ID][id] = hash
end
end
# Method to subparse program lines.
- def parse_program(line)
- @header_hash[:PG] = Hash.new unless @header_hash[:PG].is_a? Hash
+ def subparse_program(line)
+ @header[:PG] = Hash.new unless @header[:PG].is_a? Hash
hash = {}
fields = line.split("\t")
end
end
- @header_hash[:PG][:ID] = Hash.new unless @header_hash[:PG][:ID].is_a? Hash
+ @header[:PG][:ID] = Hash.new unless @header[:PG][:ID].is_a? Hash
- if @header_hash[:PG][:ID].has_key? id
+ if @header[:PG][:ID][id]
raise SamError, "Non-unique program record identifier: #{id}"
else
- @header_hash[:PG][:ID][id] = hash
+ @header[:PG][:ID][id] = hash
end
end
# Method to subparse comment lines.
- def parse_comment(line)
- @header_hash[:CO] = Array.new unless @header_hash[:CO].is_a? Array
+ def subparse_comment(line)
+ @header[:CO] = Array.new unless @header[:CO].is_a? Array
if line =~ /^@CO\t(.+)/
- @header_hash[:CO] << $1
+ @header[:CO] << $1
else
raise SamError, "Bad comment line: #{line}"
end
seq = fields[9]
qual = fields[10]
- raise SamError, "Bad qname: #{qname}" unless qname =~ /^[!-?A-~]{1,255}$/
- raise SamError, "Bad flag: #{flag}" unless (0 .. 2**16 - 1).include? flag
- raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/
- raise SamError, "Bad pos: #{pos}" unless (0 .. 2**29 - 1).include? pos
- raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq
- raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/
- raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/
- raise SamError, "Bad pnext: #{pnext}" unless (0 .. 2**29 - 1).include? pnext
- raise SamError, "Bad tlen: #{tlen}" unless (-2**29 + 1 .. 2**29 - 1).include? tlen
- raise SamError, "Bad seq: #{seq}" unless seq =~ /^(\*|[A-Za-z=.]+)$/
- raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/
+ check_qname(qname)
+ check_flag(flag)
+ check_rname(rname)
+ check_pos(pos)
+ check_mapq(mapq)
+ check_rnext(rnext)
+ check_pnext(pnext)
+ check_tlen(tlen)
+ check_seq(seq)
+ check_qual(qual)
entry = {}
entry[:QNAME] = qname
- entry[:FLAG] = flag
+ entry[:FLAG] = Flag.new(flag)
entry[:RNAME] = rname
entry[:POS] = pos
entry[:MAPQ] = mapq
- entry[:CIGAR] = cigar
+ entry[:CIGAR] = Cigar.new(cigar)
entry[:RNEXT] = rnext
entry[:PNEXT] = pnext
entry[:TLEN] = tlen
- entry[:SEQ] = seq
+ entry[:SEQ] = (qual == '*') ? Seq.new(qname, seq) : Seq.new(qname, seq, qual)
entry[:QUAL] = qual
+ # Optional fields - where some are really important! HATE HATE HATE SAM!!!
+
+ fields[11 .. -1].each do |field|
+ tag, type, val = field.split(':')
+
+ raise SamError, "Non-unique optional tag: #{tag}" if entry[tag.to_sym]
+
+ # A [!-~] Printable character
+
+ # i [-+]?[0-9]+ Signed 32-bit integer
+ if type == 'i'
+ raise SamError, "Bad tag in optional field: #{field}" unless val =~ /^[-+]?[0-9]+$/
+ val = val.to_i
+ end
+
+ # f [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? Single-precision floating number
+ # Z [ !-~]+ Printable string, including space
+ # H [0-9A-F]+ Byte array in the Hex format
+ # B [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+ Integer or numeric array
+
+ entry[tag.to_sym] = val
+ end
+
entry
end
+
+ # Validate the QNAME field: 1 to 255 characters taken from the
+ # ranges '!'..'?' and 'A'..'~' (so no '@').
+ # Raises SamError when the value does not match.
+ def check_qname(qname)
+   return if qname =~ /^[!-?A-~]{1,255}$/
+
+   raise SamError, "Bad qname: #{qname}"
+ end
+
+ # Validate the FLAG field: it must lie within 0 .. 2**16 - 1.
+ # Raises SamError otherwise.
+ def check_flag(flag)
+   return if (0 .. 2**16 - 1).include?(flag)
+
+   raise SamError, "Bad flag: #{flag}"
+ end
+
+ # Method to check rname. The value must be '*' or a well-formed
+ # reference name. When @SQ header lines were parsed and rname is
+ # not '*', it must also be one of the sequence names stored in the
+ # header hash.
+ # Raises SamError on a malformed or unknown rname.
+ def check_rname(rname)
+   raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/
+
+   return if rname == '*'
+
+   # A header holding no @SQ lines (e.g. only @HD or @PG) has no :SQ
+   # entry, so look the names up defensively instead of assuming the
+   # nested hashes exist.
+   names = @header[:SQ] && @header[:SQ][:SN]
+
+   return unless names
+
+   raise SamError, "rname not found in header hash: #{rname}" unless names[rname.to_sym]
+ end
+
+ # Validate the POS field: it must lie within 0 .. 2**29 - 1.
+ # Raises SamError otherwise.
+ def check_pos(pos)
+   return if (0 .. 2**29 - 1).include?(pos)
+
+   raise SamError, "Bad pos: #{pos}"
+ end
+
+ # Validate the MAPQ field: it must lie within 0 .. 2**8 - 1.
+ # Raises SamError otherwise.
+ def check_mapq(mapq)
+   return if (0 .. 2**8 - 1).include?(mapq)
+
+   raise SamError, "Bad mapq: #{mapq}"
+ end
+
+ # Method to check rnext. The value must be '*', '=' or a
+ # well-formed reference name. When @SQ header lines were parsed and
+ # rnext is neither '*' nor '=', it must also be one of the sequence
+ # names stored in the header hash.
+ # Raises SamError on a malformed or unknown rnext.
+ def check_rnext(rnext)
+   raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/
+
+   return if rnext == '*' || rnext == '='
+
+   # A header holding no @SQ lines (e.g. only @HD or @PG) has no :SQ
+   # entry, so look the names up defensively instead of assuming the
+   # nested hashes exist.
+   names = @header[:SQ] && @header[:SQ][:SN]
+
+   return unless names
+
+   raise SamError, "rnext not found in header hash: #{rnext}" unless names[rnext.to_sym]
+ end
+
+ # Validate the PNEXT field: it must lie within 0 .. 2**29 - 1.
+ # Raises SamError otherwise.
+ def check_pnext(pnext)
+   return if (0 .. 2**29 - 1).include?(pnext)
+
+   raise SamError, "Bad pnext: #{pnext}"
+ end
+
+ # Validate the TLEN field: it must lie within
+ # -2**29 + 1 .. 2**29 - 1. Raises SamError otherwise.
+ def check_tlen(tlen)
+   return if (-2**29 + 1 .. 2**29 - 1).include?(tlen)
+
+   raise SamError, "Bad tlen: #{tlen}"
+ end
+
+ # Validate the SEQ field: either '*' or one or more letters,
+ # '=' or '.' characters. Raises SamError otherwise.
+ def check_seq(seq)
+   return if seq =~ /^(\*|[A-Za-z=.]+)$/
+
+   raise SamError, "Bad seq: #{seq}"
+ end
+
+ # Validate the QUAL field: one or more characters in the range
+ # '!'..'~'. Raises SamError otherwise.
+ def check_qual(qual)
+   return if qual =~ /^[!-~]+$/
+
+   raise SamError, "Bad qual: #{qual}"
+ end
+
+ # Class to deconvolute the SAM flag field. Wraps the integer FLAG
+ # value and exposes one predicate per flag bit; each predicate
+ # returns true when the corresponding bit is set.
+ class Flag
+   attr_reader :flag
+
+   # Method to initialize a Flag object from the integer flag value.
+   def initialize(flag)
+     @flag = flag
+   end
+
+   # Method to test if the template has
+   # multiple fragments in sequencing.
+   def multi?
+     (flag & FLAG_MULTI) != 0
+   end
+
+   # Method to test if each fragment is
+   # properly aligned according to the aligner.
+   def aligned?
+     (flag & FLAG_ALIGNED) != 0
+   end
+
+   # Method to test if the fragment was unmapped.
+   def unmapped?
+     (flag & FLAG_UNMAPPED) != 0
+   end
+
+   # Method to test if the next fragment was unmapped.
+   def next_unmapped?
+     (flag & FLAG_NEXT_UNMAPPED) != 0
+   end
+
+   # Method to test if the fragment was reverse complemented.
+   def revcomp?
+     (flag & FLAG_REVCOMP) != 0
+   end
+
+   # Method to test if the next fragment was reverse complemented.
+   def next_revcomp?
+     (flag & FLAG_NEXT_REVCOMP) != 0
+   end
+
+   # Method to test if the fragment was first in the template.
+   def first?
+     (flag & FLAG_FIRST) != 0
+   end
+
+   # Method to test if the fragment was last in the template.
+   def last?
+     (flag & FLAG_LAST) != 0
+   end
+
+   # Method to test for secondary alignment.
+   def secondary_alignment?
+     (flag & FLAG_SECONDARY_ALIGNMENT) != 0
+   end
+
+   # Method to test for quality fail.
+   def quality_fail?
+     (flag & FLAG_QUALITY_FAIL) != 0
+   end
+
+   # Method to test for PCR or optical duplicates.
+   def duplicates?
+     (flag & FLAG_DUPLICATES) != 0
+   end
+ end
end