# Regular expressions matching a SAM header line and a SAM @CO comment line.
REGEX_HEADER  = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
REGEX_COMMENT = Regexp.new(/^@CO\t.*/)

# Bit masks for the individual bits of the SAM FLAG field.
FLAG_MULTI               = 0x1   # Template having multiple fragments in sequencing
FLAG_ALIGNED             = 0x2   # Each fragment properly aligned according to the aligner
FLAG_UNMAPPED            = 0x4   # Fragment unmapped
FLAG_NEXT_UNMAPPED       = 0x8   # Next fragment in the template unmapped
FLAG_REVCOMP             = 0x10  # SEQ being reverse complemented
FLAG_NEXT_REVCOMP        = 0x20  # SEQ of the next fragment in the template being reversed
FLAG_FIRST               = 0x40  # The first fragment in the template
FLAG_LAST                = 0x80  # The last fragment in the template
FLAG_SECONDARY_ALIGNMENT = 0x100 # Secondary alignment
FLAG_QUALITY_FAIL        = 0x200 # Not passing quality controls
FLAG_DUPLICATES          = 0x400 # PCR or optical duplicate

# Error class for all exceptions to do with SAM.
class SamError < StandardError; end

# Class method to convert a SAM entry to a Biopiece record.
#
# sam is an entry hash keyed by SAM field name (:QNAME, :FLAG, :RNAME, :POS,
# :MAPQ, :CIGAR, :RNEXT, :PNEXT, :TLEN, :SEQ) plus optional tags such as
# :NM and :MD. sam[:FLAG] must respond to #flag (the integer FLAG value) and
# sam[:SEQ] to #seq, #qual and #convert_phred2illumina!.
#
# Returns a new hash with the Biopiece keys :REC_TYPE, :Q_ID, :STRAND, :S_ID,
# :S_BEG, :MAPQ, :CIGAR and :SEQ; :Q_ID2, :S_BEG2 and :TLEN are added when
# RNEXT is not '*', :SCORES when qualities are present, and :ALIGN when the
# NM tag is greater than zero.
def self.to_bp(sam)
  bp = {}

  bp[:REC_TYPE] = 'SAM'
  bp[:Q_ID]     = sam[:QNAME]

  # Test the bit explicitly: the Integer produced by `&` is truthy in Ruby
  # even when it is 0, so it must not be used directly as a condition.
  bp[:STRAND]   = (sam[:FLAG].flag & FLAG_REVCOMP).zero? ? '+' : '-'

  bp[:S_ID]     = sam[:RNAME]
  bp[:S_BEG]    = sam[:POS]
  bp[:MAPQ]     = sam[:MAPQ]
  bp[:CIGAR]    = sam[:CIGAR].to_s

  unless sam[:RNEXT] == '*'
    bp[:Q_ID2]  = sam[:RNEXT]
    bp[:S_BEG2] = sam[:PNEXT]
    bp[:TLEN]   = sam[:TLEN]
  end

  bp[:SEQ] = sam[:SEQ].seq

  # NOTE(review): convert_phred2illumina! is invoked on the entry's own
  # sequence object; the bang suggests it modifies it in place - confirm.
  bp[:SCORES] = sam[:SEQ].convert_phred2illumina!.qual if sam[:SEQ].qual

  bp[:ALIGN] = align_descriptors(sam) if sam.key?(:NM) && sam[:NM].to_i > 0

  bp
end

# Class method to convert a Biopiece record into a SAM entry.
#
# Not implemented yet: the record is ignored and the placeholder
# string "FISK" is returned.
def self.to_sam(bp)
  "FISK" # FIXME
end

# Create alignment descriptors according to the KISS
# format description:
# http://code.google.com/p/biopieces/wiki/KissFormat
#
# Insertions are read from sam[:CIGAR] (iterated as length/operation pairs),
# deletions and mismatches from the sam[:MD] tag. A missing MD tag is treated
# as empty, so only insertions are reported in that case.
#
# Returns a string of comma separated "offset:descriptor" items sorted by
# offset, where a descriptor is "->N" (insertion), "N>-" (deletion) or
# "N>M" (mismatch).
def self.align_descriptors(sam)
  seq   = sam[:SEQ].seq
  align = []

  # Insertions
  # NOTE(review): offset advances by the length of every CIGAR operation,
  # including those that consume no read bases - confirm this is intended.
  offset = 0

  sam[:CIGAR].each do |len, op|
    if op == 'I'
      len.times do |i|
        align << [offset + i, "->#{seq[offset + i]}"]
      end
    end

    offset += len
  end

  # Deletions and mismatches
  offset    = 0
  deletions = 0

  sam[:MD].to_s.scan(/\d+|\^[A-Z]+|[A-Z]+/).each do |m|
    if m =~ /\d+/                    # Matches
      offset += m.to_i
    elsif m[0] == '^'                # Deletions
      m.each_char do |nt|
        next if nt == '^'

        align << [offset, "#{nt}>-"]
        deletions += 1
        offset    += 1
      end
    else                             # Mismatches
      m.each_char do |nt|
        # Deleted bases are absent from the read, hence the shift back.
        nt2 = seq[offset - deletions]

        align << [offset, "#{nt}>#{nt2}"]
        offset += 1
      end
    end
  end

  align.sort_by { |pos, _| pos }.map { |pos, desc| "#{pos}:#{desc}" }.join(",")
end
def initialize(io = nil) @io = io @@ -253,7 +352,6 @@ class Sam < Filesys check_rname(rname) check_pos(pos) check_mapq(mapq) - check_cigar(cigar, seq) check_rnext(rnext) check_pnext(pnext) check_tlen(tlen) @@ -262,17 +360,40 @@ class Sam < Filesys entry = {} entry[:QNAME] = qname - entry[:FLAG] = flag + entry[:FLAG] = Flag.new(flag) entry[:RNAME] = rname entry[:POS] = pos entry[:MAPQ] = mapq - entry[:CIGAR] = cigar + entry[:CIGAR] = Cigar.new(cigar) entry[:RNEXT] = rnext entry[:PNEXT] = pnext entry[:TLEN] = tlen - entry[:SEQ] = Seq.new(qname, seq) + entry[:SEQ] = (qual == '*') ? Seq.new(qname, seq) : Seq.new(qname, seq, qual) entry[:QUAL] = qual + # Optional fields - where some are really important! HATE HATE HATE SAM!!! + + fields[11 .. -1].each do |field| + tag, type, val = field.split(':') + + raise SamError, "Non-unique optional tag: #{tag}" if entry.has_key? tag.to_sym + + # A [!-~] Printable character + + # i [-+]?[0-9]+ Singed 32-bit integer + if type == 'i' + raise SamError, "Bad tag in optional field: #{field}" unless val =~ /^[-+]?[0-9]+$/ + val = val.to_i + end + + # f [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? Single-precision floating number + # Z [ !-~]+ Printable string, including space + # H [0-9A-F]+ Byte array in the Hex format + # B [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+ Integer or numeric array + + entry[tag.to_sym] = val + end + entry end @@ -309,44 +430,6 @@ class Sam < Filesys raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq end - # Method to check cigar string. - def check_cigar(cigar, seq) - raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/ - - unless cigar == '*' - check_cigar_hard_clip(cigar) - check_cigar_soft_clip(cigar) - check_cigar_seq_len(cigar, seq) unless seq == '*' - end - end - - # Method to check cigar hard clipping only at ends. 
# Method to check that qual consists solely of one or more
# characters in the range '!' to '~'.
#
# Raises SamError if it does not.
def check_qual(qual)
  raise SamError, "Bad qual: #{qual}" unless qual =~ /^[!-~]+$/
end

# Class to deconvolute the SAM flag field.
#
# Wraps the integer FLAG value and answers, via one predicate per FLAG_*
# bit mask, whether that bit is set. Every predicate returns true or false.
class Flag
  attr_reader :flag

  # Method to initialize a Flag object from the
  # integer value of the SAM FLAG field.
  def initialize(flag)
    @flag = flag
  end

  # Method to test if template have
  # multiple fragments in sequencing.
  def multi?
    bit_set?(FLAG_MULTI)
  end

  # Method to test if each fragment
  # properly aligned according to the aligner.
  def aligned?
    bit_set?(FLAG_ALIGNED)
  end

  # Method to test if the fragment was unmapped.
  def unmapped?
    bit_set?(FLAG_UNMAPPED)
  end

  # Method to test if the next fragment was unmapped.
  def next_unmapped?
    bit_set?(FLAG_NEXT_UNMAPPED)
  end

  # Method to test if the fragment was reverse complemented.
  def revcomp?
    bit_set?(FLAG_REVCOMP)
  end

  # Method to test if the next fragment was reverse complemented.
  def next_revcomp?
    bit_set?(FLAG_NEXT_REVCOMP)
  end

  # Method to test if the fragment was first in the template.
  def first?
    bit_set?(FLAG_FIRST)
  end

  # Method to test if the fragment was last in the template.
  def last?
    bit_set?(FLAG_LAST)
  end

  # Method to test for secondary alignment.
  def secondary_alignment?
    bit_set?(FLAG_SECONDARY_ALIGNMENT)
  end

  # Method to test for quality fail.
  def quality_fail?
    bit_set?(FLAG_QUALITY_FAIL)
  end

  # Method to test for PCR or optical duplicates.
  def duplicates?
    bit_set?(FLAG_DUPLICATES)
  end

  private

  # Method to test if any bit of mask is set in the flag.
  # The comparison with 0 is required: `flag & mask` alone is an Integer,
  # and 0 is truthy in Ruby, so it cannot serve as a boolean.
  def bit_set?(mask)
    (flag & mask) != 0
  end
end