From 6dce9c0b201b3cef810231cd7baacf4057a94d5b Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 31 Aug 2011 12:41:48 +0000 Subject: [PATCH] worked on sam - added 1 to phred score check in seq.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1503 74ccb610-7750-0410-82ae-013aeee3265d --- code_ruby/lib/maasha/cigar.rb | 26 ++++++++++++++++++- code_ruby/lib/maasha/sam.rb | 47 +++-------------------------------- code_ruby/lib/maasha/seq.rb | 2 +- 3 files changed, 29 insertions(+), 46 deletions(-) diff --git a/code_ruby/lib/maasha/cigar.rb b/code_ruby/lib/maasha/cigar.rb index 765a4fa..2e6fd0a 100644 --- a/code_ruby/lib/maasha/cigar.rb +++ b/code_ruby/lib/maasha/cigar.rb @@ -32,22 +32,33 @@ require 'pp' # Error class for all exceptions to do with CIGAR. class CigarError < StandardError; end -# Class to parse and write SAM files. +# Class to manipulate CIGAR strings. class Cigar attr_accessor :cigar + # Method to initialize a CIGAR string. def initialize(cigar) @cigar = cigar check_cigar end + def to_s + @cigar + end + + # Method to iterate over the length and operators + # in a CIGAR string. def each cigar.scan(/(\d+)([MIDNSHPX=])/).each do |len, op| yield [len.to_i, op] end end + # Method to return the number length of + # the residues described in the CIGAR string. + # This length should match the length of the + # aligned sequence. def length total = 0 @@ -58,6 +69,8 @@ class Cigar total end + # Method to return the number of matched + # residues in the CIGAR string. def matches total = 0 @@ -68,6 +81,8 @@ class Cigar total end + # Method to return the number of inserted + # residues in the CIGAR string. def insertions total = 0 @@ -78,6 +93,8 @@ class Cigar total end + # Method to return the number of deleted + # residues in the CIGAR string. def deletions total = 0 @@ -88,6 +105,8 @@ class Cigar total end + # Method to return the number of hard clipped + # residues in the CIGAR string. def clip_hard total = 0 @@ -98,6 +117,8 @@ class Cigar total end + # Method to return the number of soft clipped + # residues in the CIGAR string. def clip_soft total = 0 @@ -110,6 +131,9 @@ class Cigar private + # Method to check that the CIGAR string is formatted + # correctly, including hard clipping and soft clipping + # can't be located internally. def check_cigar unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/ raise CigarError, "Bad cigar format: #{cigar}" diff --git a/code_ruby/lib/maasha/sam.rb b/code_ruby/lib/maasha/sam.rb index 79aa5ad..272d8fc 100644 --- a/code_ruby/lib/maasha/sam.rb +++ b/code_ruby/lib/maasha/sam.rb @@ -29,6 +29,7 @@ require 'pp' require 'maasha/filesys' require 'maasha/seq' +require 'maasha/cigar' REGEX_HEADER = Regexp.new(/^@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/) REGEX_COMMENT = Regexp.new(/^@CO\t.*/) @@ -88,13 +89,10 @@ class Sam < Filesys # http://code.google.com/p/biopieces/wiki/KissFormat def self.align_descriptors(sam) offset = 0 - insertions = 0 align = [] # Insertions - sam[:CIGAR].scan(/([0-9]+)([MIDNSHPX=])/).each do |len, op| - len = len.to_i - + sam[:CIGAR].each do |len, op| if op == 'I' (0 ... len).each_with_index do |i| nt = sam[:SEQ].seq[offset + i] @@ -346,7 +344,6 @@ class Sam < Filesys check_rname(rname) check_pos(pos) check_mapq(mapq) - check_cigar(cigar, seq) check_rnext(rnext) check_pnext(pnext) check_tlen(tlen) @@ -359,7 +356,7 @@ class Sam < Filesys entry[:RNAME] = rname entry[:POS] = pos entry[:MAPQ] = mapq - entry[:CIGAR] = cigar + entry[:CIGAR] = Cigar.new(cigar) entry[:RNEXT] = rnext entry[:PNEXT] = pnext entry[:TLEN] = tlen @@ -425,44 +422,6 @@ class Sam < Filesys raise SamError, "Bad mapq: #{mapq}" unless (0 .. 2**8 - 1).include? mapq end - # Method to check cigar string. - def check_cigar(cigar, seq) - raise SamError, "Bad cigar: #{cigar}" unless cigar =~ /^(\*|([0-9]+[MIDNSHPX=])+)$/ - - unless cigar == '*' - check_cigar_hard_clip(cigar) - check_cigar_soft_clip(cigar) - check_cigar_seq_len(cigar, seq) unless seq == '*' - end - end - - # Method to check cigar hard clipping only at ends. - def check_cigar_hard_clip(cigar) - if cigar.gsub(/^[0-9]+H|[0-9]+H$/, "").match('H') - raise SamError, "Bad cigar with internal H: #{cigar}" - end - end - - # Method to check cigar soft clipping only at ends or H. - def check_cigar_soft_clip(cigar) - if cigar.gsub(/^[0-9]+H|[0-9]+H$/, "").gsub(/^[0-9]+S|[0-9]+S$/, "").match('S') - raise SamError, "Bad cigar with internal S: #{cigar}" - end - end - - # Method to check cigar length matches sequence length. - def check_cigar_seq_len(cigar, seq) - cigar_len = 0 - - cigar.scan(/([0-9]+)([MIDNSHPX=])/).each do |len, op| - cigar_len += len.to_i if op =~ /[MISX=]/ - end - - if cigar_len != seq.length - raise SamError, "cigar and sequence length mismatch: #{cigar_len} != #{seq.length}" - end - end - # Method to check if rnext, when not '*' or '=' # and @SQ header lines are present, is located # in the header hash. diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index a3fe789..3b950ae 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -335,7 +335,7 @@ class Seq def convert_phred2illumina! self.qual.gsub!(/./) do |score| score_phred = score.ord - SCORE_PHRED - raise SeqError, "Bad Phred score: #{score} (#{score_phred})" unless (0 .. 40).include? score_phred + raise SeqError, "Bad Phred score: #{score} (#{score_phred})" unless (0 .. 41).include? score_phred score_illumina = score_phred + SCORE_ILLUMINA score = score_illumina.chr end -- 2.39.2