X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_ruby%2Flib%2Fmaasha%2Fsam.rb;h=cf5acda4683c12bb63407d02b0abbdbc9318a37e;hb=27a34d675d07468851f2076a62ca2b8dbefb5923;hp=be4e9f4f929c2f015825b37c67c12525aa7b9156;hpb=07d51da11f30e072f586e520671f20643143d734;p=biopieces.git diff --git a/code_ruby/lib/maasha/sam.rb b/code_ruby/lib/maasha/sam.rb index be4e9f4..cf5acda 100644 --- a/code_ruby/lib/maasha/sam.rb +++ b/code_ruby/lib/maasha/sam.rb @@ -1,4 +1,4 @@ -# Copyright (C) 2007-2011 Martin A. Hansen. +# Copyright (C) 2007-2013 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -29,6 +29,7 @@ require 'pp' require 'maasha/filesys' require 'maasha/seq' +require 'maasha/cigar' REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/) REGEX_COMMENT = Regexp.new(/^@CO\t.*/) @@ -58,12 +59,13 @@ class Sam < Filesys bp = {} bp[:REC_TYPE] = 'SAM' - bp[:Q_ID] = sam[:QNAME] - bp[:STRAND] = sam[:FLAG].revcomp? ? '-' : '+' - bp[:S_ID] = sam[:RNAME] - bp[:S_BEG] = sam[:POS] - bp[:MAPQ] = sam[:MAPQ] - bp[:CIGAR] = sam[:CIGAR] + bp[:Q_ID] = sam[:QNAME] + bp[:STRAND] = sam[:FLAG].revcomp? ? '-' : '+' + bp[:S_ID] = sam[:RNAME] + bp[:S_BEG] = sam[:POS] + bp[:S_END] = sam[:POS] + sam[:SEQ].length - 1 + bp[:MAPQ] = sam[:MAPQ] + bp[:CIGAR] = sam[:CIGAR].to_s unless sam[:RNEXT] == '*' bp[:Q_ID2] = sam[:RNEXT] @@ -74,27 +76,55 @@ class Sam < Filesys bp[:SEQ] = sam[:SEQ].seq unless sam[:SEQ].qual.nil? - bp[:SCORES] = sam[:SEQ].convert_phred2illumina!.qual + bp[:SCORES] = sam[:SEQ].qual end - if sam.has_key? :NM and sam[:NM].to_i > 0 + if sam[:NM] and sam[:NM].to_i > 0 + bp[:NM] = sam[:NM] + bp[:MD] = sam[:MD] bp[:ALIGN] = self.align_descriptors(sam) end bp end - # Create align descriptors according to the KISS format description: + # Class method to create a new SAM entry from a Biopiece record. + # FIXME + def self.new_bp(bp) + qname = bp[:Q_ID] + flag = 0 + rname = bp[:S_ID] + pos = bp[:S_BEG] + mapq = bp[:MAPQ] + cigar = bp[:CIGAR] + rnext = bp[:Q_ID2] || '*' + pnext = bp[:S_BEG2] || 0 + tlen = bp[:TLEN] || 0 + seq = bp[:SEQ] + qual = bp[:SCORES] || '*' + nm = "NM:i:#{bp[:NM]}" if bp[:NM] + md = "MD:Z:#{bp[:MD]}" if bp[:MD] + + flag |= FLAG_REVCOMP if bp[:STRAND] == '+' + + if qname && flag && rname && pos && mapq && cigar && rnext && pnext && tlen && seq && qual + ary = [qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual] + ary << nm if nm + ary << md if md + + ary.join("\t") + end + end + + # Create alignment descriptors according to the KISS + # format description: # http://code.google.com/p/biopieces/wiki/KissFormat def self.align_descriptors(sam) - offset = 0 - insertions = 0 - align = [] + offset = 0 + align = [] # Insertions - sam[:CIGAR].scan(/([0-9]+)([MIDNSHPX=])/).each do |len, op| - len = len.to_i - + sam[:CIGAR].each do |len, op| if op == 'I' (0 ... len).each_with_index do |i| nt = sam[:SEQ].seq[offset + i] @@ -113,13 +143,14 @@ class Sam < Filesys if m =~ /\d+/ # Matches offset += m.to_i elsif m[0] == '^' # Deletions - m[1 .. -1].each_char do |nt| - align << [offset, "#{nt}>-"] - - deletions += 1 - offset += 1 + m.each_char do |nt| + unless nt == '^' + align << [offset, "#{nt}>-"] + deletions += 1 + offset += 1 + end end - else # Mismatches + else # Mismatches m.each_char do |nt| nt2 = sam[:SEQ].seq[offset - deletions] @@ -130,7 +161,7 @@ class Sam < Filesys end end - align.sort_by { |a| a.first }.map { |k,v| "#{k}:#{v}" }.join(",") + align.sort_by { |a| a.first }.map { |k, v| "#{k}:#{v}" }.join(",") end # Method to initialize a Sam object. @@ -233,7 +264,7 @@ class Sam < Filesys @header[:SQ][:SN] = Hash.new unless @header[:SQ][:SN].is_a? Hash - if @header[:SQ][:SN].has_key? seq_name + if @header[:SQ][:SN][seq_name] raise SamError, "Non-unique sequence name: #{seq_name}" else @header[:SQ][:SN][seq_name] = hash @@ -261,13 +292,13 @@ class Sam < Filesys end end - if hash.has_key? :FO + if hash[:FO] unless hash[:FO] =~ /^\*|[ACMGRSVTWYHKDBN]+$/ raise SamError, "Bad flow order: #{hash[:FO]}" end end - if hash.has_key? :PL + if hash[:PL] unless hash[:PL] =~ /^(CAPILLARY|LS454|ILLUMINA|SOLID|HELICOS|IONTORRENT|PACBIO)$/ raise SamError, "Bad platform: #{hash[:PL]}" end @@ -275,7 +306,7 @@ class Sam < Filesys @header[:RG][:ID] = Hash.new unless @header[:RG][:ID].is_a? Hash - if @header[:RG][:ID].has_key? id + if @header[:RG][:ID][id] raise SamError, "Non-unique read group identifier: #{id}" else @header[:RG][:ID][id] = hash @@ -305,7 +336,7 @@ class Sam < Filesys @header[:PG][:ID] = Hash.new unless @header[:PG][:ID].is_a? Hash - if @header[:PG][:ID].has_key? id + if @header[:PG][:ID][id] raise SamError, "Non-unique program record identifier: #{id}" else @header[:PG][:ID][id] = hash @@ -346,7 +377,6 @@ class Sam < Filesys check_rname(rname) check_pos(pos) check_mapq(mapq) - check_cigar(cigar, seq) check_rnext(rnext) check_pnext(pnext) check_tlen(tlen) @@ -359,11 +389,11 @@ class Sam < Filesys entry[:RNAME] = rname entry[:POS] = pos entry[:MAPQ] = mapq - entry[:CIGAR] = cigar + entry[:CIGAR] = Cigar.new(cigar) entry[:RNEXT] = rnext entry[:PNEXT] = pnext entry[:TLEN] = tlen - entry[:SEQ] = (qual == '*') ? Seq.new(qname, seq) : Seq.new(qname, seq, qual) + entry[:SEQ] = (qual == '*') ? Seq.new(seq_name: qname, seq: seq) : Seq.new(seq_name: qname, seq: seq, qual: qual) entry[:QUAL] = qual # Optional fields - where some are really important! HATE HATE HATE SAM!!! @@ -371,7 +401,7 @@ class Sam < Filesys fields[11 .. -1].each do |field| tag, type, val = field.split(':') - raise SamError, "Non-unique optional tag: #{tag}" if entry.has_key? tag.to_sym + raise SamError, "Non-unique optional tag: #{tag}" if entry[tag.to_sym] # A [!-~] Printable character @@ -409,7 +439,7 @@ class Sam < Filesys raise SamError, "Bad rname: #{rname}" unless rname =~ /^(\*|[!-()+-<>-~][!-~]*)$/ unless @header.empty? or rname == '*' - unless @header[:SQ][:SN].has_key? rname.to_sym + unless @header[:SQ][:SN][rname.to_sym] raise SamError, "rname not found in header hash: #{rname}" end end @@ -425,44 +455,6 @@ class Sam < Filesys raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq end - # Method to check cigar string. - def check_cigar(cigar, seq) - raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/ - - unless cigar == '*' - check_cigar_hard_clip(cigar) - check_cigar_soft_clip(cigar) - check_cigar_seq_len(cigar, seq) unless seq == '*' - end - end - - # Method to check cigar hard clipping only at ends. - def check_cigar_hard_clip(cigar) - if cigar.gsub(/^[0-9]+H|[0-9]+H$/, "").match('H') - raise SamError, "Bad cigar with internal H: #{cigar}" - end - end - - # Method to check cigar soft clipping only at ends or H. - def check_cigar_soft_clip(cigar) - if cigar.gsub(/^[0-9]+H|[0-9]+H$/, "").gsub(/^[0-9]+S|[0-9]+S$/, "").match('S') - raise SamError, "Bad cigar with internal S: #{cigar}" - end - end - - # Method to check cigar length matches sequence length. - def check_cigar_seq_len(cigar, seq) - cigar_len = 0 - - cigar.scan(/([0-9]+)([MIDNSHPX=])/).each do |len, op| - cigar_len += len.to_i if op =~ /[MISX=]/ - end - - if cigar_len != seq.length - raise SamError, "cigar and sequence length mismatch: #{cigar_len} != #{seq.length}" - end - end - # Method to check if rnext, when not '*' or '=' # and @SQ header lines are present, is located # in the header hash. @@ -470,7 +462,7 @@ class Sam < Filesys raise SamError, "Bad rnext: #{rnext}" unless rnext =~ /^(\*|=|[!-()+-<>-~][!-~]*)$/ unless @header.empty? or rnext == '*' or rnext == '=' - unless @header[:SQ][:SN].has_key? rnext.to_sym + unless @header[:SQ][:SN][rnext.to_sym] raise SamError, "rnext not found in header hash: #{rnext}" end end @@ -508,58 +500,58 @@ class Sam < Filesys # Method to test if template have # multiple fragments in sequencing. def multi? - flag & FLAG_MULTI + (flag & FLAG_MULTI) == 0 end # Method to test if each fragment # properly aligned according to the aligner. def aligned? - flag & FLAG_ALIGNED + (flag & FLAG_ALIGNED) == 0 end # Method to test if the fragment was unmapped. def unmapped? - flag & FLAG_UNMAPPED + (flag & FLAG_UNMAPPED) == 0 end # Method to test if the next fragment was unmapped. def next_unmapped? - flag & FLAG_NEXT_UNMAPPED + (flag & FLAG_NEXT_UNMAPPED) == 0 end # Method to test if the fragment was reverse complemented. def revcomp? - flag & FLAG_REVCOMP + (flag & FLAG_REVCOMP) == 0 end # Method to test if the next fragment was reverse complemented. def next_revcomp? - flag & FLAG_NEXT_REVCOMP + (flag & FLAG_NEXT_REVCOMP) == 0 end # Method to test if the fragment was first in the template. def first? - flag & FLAG_FIRST + (flag & FLAG_FIRST) == 0 end # Method to test if the fragment was last in the template. def last? - flag & FLAG_LAST + (flag & FLAG_LAST) == 0 end # Method to test for secondary alignment. def secondary_alignment? - flag & FLAG_SECONDARY_ALIGNMENT + (flag & FLAG_SECONDARY_ALIGNMENT) == 0 end # Method to test for quality fail. def quality_fail? - flag & FLAG_QUALITY_FAIL + (flag & FLAG_QUALITY_FAIL) == 0 end # Method to test for PCR or optical duplicates. def duplicates? - flag & FLAG_DUPLICATES + (flag & FLAG_DUPLICATES) == 0 end end end