# Error class for all exceptions to do with CIGAR.
class CigarError < StandardError; end
-# Class to parse and write SAM files.
+# Class to manipulate CIGAR strings.
class Cigar
attr_accessor :cigar
+ # Method to initialize a CIGAR string.
def initialize(cigar)
@cigar = cigar
check_cigar
end
+ def to_s
+ @cigar
+ end
+
+ # Method to iterate over the length and operators
+ # in a CIGAR string.
def each
cigar.scan(/(\d+)([MIDNSHPX=])/).each do |len, op|
yield [len.to_i, op]
end
end
+ # Method to return the number length of
+ # the residues described in the CIGAR string.
+ # This length should match the length of the
+ # aligned sequence.
def length
total = 0
total
end
+ # Method to return the number of matched
+ # residues in the CIGAR string.
def matches
total = 0
total
end
+ # Method to return the number of inserted
+ # residues in the CIGAR string.
def insertions
total = 0
total
end
+ # Method to return the number of deleted
+ # residues in the CIGAR string.
def deletions
total = 0
total
end
+ # Method to return the number of hard clipped
+ # residues in the CIGAR string.
def clip_hard
total = 0
total
end
+ # Method to return the number of soft clipped
+ # residues in the CIGAR string.
def clip_soft
total = 0
private
+ # Method to check that the CIGAR string is formatted
+ # correctly, including hard clipping and soft clipping
+ # can't be located internally.
def check_cigar
unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/
raise CigarError, "Bad cigar format: #{cigar}"
require 'pp'
require 'maasha/filesys'
require 'maasha/seq'
+require 'maasha/cigar'
REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/)
REGEX_COMMENT = Regexp.new(/^@CO\t.*/)
# http://code.google.com/p/biopieces/wiki/KissFormat
def self.align_descriptors(sam)
offset = 0
- insertions = 0
align = []
# Insertions
- sam[:CIGAR].scan(/([0-9]+)([MIDNSHPX=])/).each do |len, op|
- len = len.to_i
-
+ sam[:CIGAR].each do |len, op|
if op == 'I'
(0 ... len).each_with_index do |i|
nt = sam[:SEQ].seq[offset + i]
check_rname(rname)
check_pos(pos)
check_mapq(mapq)
- check_cigar(cigar, seq)
check_rnext(rnext)
check_pnext(pnext)
check_tlen(tlen)
entry[:RNAME] = rname
entry[:POS] = pos
entry[:MAPQ] = mapq
- entry[:CIGAR] = cigar
+ entry[:CIGAR] = Cigar.new(cigar)
entry[:RNEXT] = rnext
entry[:PNEXT] = pnext
entry[:TLEN] = tlen
raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq
end
- # Method to check cigar string.
- def check_cigar(cigar, seq)
- raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/
-
- unless cigar == '*'
- check_cigar_hard_clip(cigar)
- check_cigar_soft_clip(cigar)
- check_cigar_seq_len(cigar, seq) unless seq == '*'
- end
- end
-
- # Method to check cigar hard clipping only at ends.
- def check_cigar_hard_clip(cigar)
- if cigar.gsub(/^[0-9]+H|[0-9]+H$/, "").match('H')
- raise SamError, "Bad cigar with internal H: #{cigar}"
- end
- end
-
- # Method to check cigar soft clipping only at ends or H.
- def check_cigar_soft_clip(cigar)
- if cigar.gsub(/^[0-9]+H|[0-9]+H$/, "").gsub(/^[0-9]+S|[0-9]+S$/, "").match('S')
- raise SamError, "Bad cigar with internal S: #{cigar}"
- end
- end
-
- # Method to check cigar length matches sequence length.
- def check_cigar_seq_len(cigar, seq)
- cigar_len = 0
-
- cigar.scan(/([0-9]+)([MIDNSHPX=])/).each do |len, op|
- cigar_len += len.to_i if op =~ /[MISX=]/
- end
-
- if cigar_len != seq.length
- raise SamError, "cigar and sequence length mismatch: #{cigar_len} != #{seq.length}"
- end
- end
-
# Method to check if rnext, when not '*' or '='
# and @SQ header lines are present, is located
# in the header hash.
def convert_phred2illumina!
self.qual.gsub!(/./) do |score|
score_phred = score.ord - SCORE_PHRED
- raise SeqError, "Bad Phred score: #{score} (#{score_phred})" unless (0 .. 40).include? score_phred
+ raise SeqError, "Bad Phred score: #{score} (#{score_phred})" unless (0 .. 41).include? score_phred
score_illumina = score_phred + SCORE_ILLUMINA
score = score_illumina.chr
end